1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* At first, see the document in `character.h' to understand the code
37 #include <sys/types.h>
39 #include "character.h"
42 #include "composite.h"
51 Lisp_Object Qcharacterp
;
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector
;
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars
;
60 Lisp_Object Qauto_fill_chars
;
62 Lisp_Object Vchar_unify_table
;
64 /* A char-table. An element is non-nil iff the corresponding
65 character has a printable glyph. */
66 Lisp_Object Vprintable_chars
;
68 /* A char-table. An elemnent is a column-width of the corresponding
70 Lisp_Object Vchar_width_table
;
72 /* A char-table. An element is a symbol indicating the direction
73 property of corresponding character. */
74 Lisp_Object Vchar_direction_table
;
76 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
77 unsigned char *_fetch_multibyte_char_p
;
78 int _fetch_multibyte_char_len
;
80 /* Char table of scripts. */
81 Lisp_Object Vchar_script_table
;
83 static Lisp_Object Qchar_script_table
;
89 char_string_with_unification (c
, p
)
97 if (c
<= MAX_3_BYTE_CHAR
|| c
> MAX_5_BYTE_CHAR
)
99 bytes
= CHAR_STRING (c
, p
);
101 else if (c
<= MAX_4_BYTE_CHAR
)
103 p
[0] = (0xF0 | (c
>> 18));
104 p
[1] = (0x80 | ((c
>> 12) & 0x3F));
105 p
[2] = (0x80 | ((c
>> 6) & 0x3F));
106 p
[3] = (0x80 | (c
& 0x3F));
112 p
[1] = (0x80 | ((c
>> 18) & 0x0F));
113 p
[2] = (0x80 | ((c
>> 12) & 0x3F));
114 p
[3] = (0x80 | ((c
>> 6) & 0x3F));
115 p
[4] = (0x80 | (c
& 0x3F));
124 string_char_with_unification (p
, advanced
, len
)
125 const unsigned char *p
;
126 const unsigned char **advanced
;
130 const unsigned char *saved_p
= p
;
132 if (*p
< 0x80 || ! (*p
& 0x20) || ! (*p
& 0x10))
134 c
= STRING_CHAR_ADVANCE (p
);
136 else if (! (*p
& 0x08))
138 c
= ((((p
)[0] & 0xF) << 18)
139 | (((p
)[1] & 0x3F) << 12)
140 | (((p
)[2] & 0x3F) << 6)
146 c
= ((((p
)[1] & 0x3F) << 18)
147 | (((p
)[2] & 0x3F) << 12)
148 | (((p
)[3] & 0x3F) << 6)
153 MAYBE_UNIFY_CHAR (c
);
163 /* Translate character C by translation table TABLE. If C is
164 negative, translate a character specified by CHARSET and CODE. If
165 no translation is found in TABLE, return the untranslated
169 translate_char (table
, c
)
175 if (! CHAR_TABLE_P (table
))
177 ch
= CHAR_TABLE_REF (table
, c
);
178 if (! CHARACTERP (ch
))
183 /* Convert the unibyte character C to the corresponding multibyte
184 character based on the current value of charset_unibyte. If C
185 can't be converted, return C. */
188 unibyte_char_to_multibyte (c
)
191 struct charset
*charset
= CHARSET_FROM_ID (charset_unibyte
);
192 int c1
= DECODE_CHAR (charset
, c
);
194 return ((c1
>= 0) ? c1
: c
);
198 /* Convert the multibyte character C to unibyte 8-bit character based
199 on the current value of charset_unibyte. If dimension of
200 charset_unibyte is more than one, return (C & 0xFF).
202 The argument REV_TBL is now ignored. It will be removed in the
206 multibyte_char_to_unibyte (c
, rev_tbl
)
210 struct charset
*charset
= CHARSET_FROM_ID (charset_unibyte
);
211 unsigned c1
= ENCODE_CHAR (charset
, c
);
213 return ((c1
!= CHARSET_INVALID_CODE (charset
)) ? c1
: c
& 0xFF);
217 DEFUN ("characterp", Fcharacterp
, Scharacterp
, 1, 2, 0,
218 doc
: /* Return non-nil if OBJECT is a character. */)
220 Lisp_Object object
, ignore
;
222 return (CHARACTERP (object
) ? Qt
: Qnil
);
225 DEFUN ("max-char", Fmax_char
, Smax_char
, 0, 0, 0,
226 doc
: /* Return the character of the maximum code. */)
229 return make_number (MAX_CHAR
);
232 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
233 Sunibyte_char_to_multibyte
, 1, 1, 0,
234 doc
: /* Convert the unibyte character CH to multibyte character.
235 The multibyte character is a result of decoding CH by
236 the current primary charset (value of `charset-primary'). */)
241 struct charset
*charset
;
243 CHECK_CHARACTER (ch
);
246 error ("Invalid unibyte character: %d", c
);
247 charset
= CHARSET_FROM_ID (charset_unibyte
);
248 c
= DECODE_CHAR (charset
, c
);
250 c
= BYTE8_TO_CHAR (XFASTINT (ch
));
251 return make_number (c
);
254 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
255 Smultibyte_char_to_unibyte
, 1, 1, 0,
256 doc
: /* Convert the multibyte character CH to unibyte character.\n\
257 The unibyte character is a result of encoding CH by
258 the current primary charset (value of `charset-primary'). */)
264 CHECK_CHARACTER (ch
);
266 c
= CHAR_TO_BYTE8 (c
);
267 return make_number (c
);
270 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
271 doc
: /* Return 1 regardless of the argument CHAR.
272 This is now an obsolete function. We keep it just for backward compatibility. */)
276 CHECK_CHARACTER (ch
);
277 return make_number (1);
280 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
281 doc
: /* Return width of CHAR when displayed in the current buffer.
282 The width is measured by how many columns it occupies on the screen.
283 Tab is taken to occupy `tab-width' columns. */)
289 struct Lisp_Char_Table
*dp
= buffer_display_table ();
291 CHECK_CHARACTER (ch
);
294 /* Get the way the display table would display it. */
295 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
298 width
= ASIZE (disp
);
300 width
= CHAR_WIDTH (c
);
302 return make_number (width
);
305 /* Return width of string STR of length LEN when displayed in the
306 current buffer. The width is measured by how many columns it
307 occupies on the screen. If PRECISION > 0, return the width of
308 longest substring that doesn't exceed PRECISION, and set number of
309 characters and bytes of the substring in *NCHARS and *NBYTES
313 c_string_width (str
, len
, precision
, nchars
, nbytes
)
315 int precision
, *nchars
, *nbytes
;
317 int i
= 0, i_byte
= 0;
319 struct Lisp_Char_Table
*dp
= buffer_display_table ();
323 int bytes
, thiswidth
;
325 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
329 val
= DISP_CHAR_VECTOR (dp
, c
);
331 thiswidth
= XVECTOR (val
)->size
;
333 thiswidth
= CHAR_WIDTH (c
);
337 thiswidth
= CHAR_WIDTH (c
);
341 && (width
+ thiswidth
> precision
))
361 /* Return width of string STR of length LEN when displayed in the
362 current buffer. The width is measured by how many columns it
363 occupies on the screen. */
370 return c_string_width (str
, len
, -1, NULL
, NULL
);
373 /* Return width of Lisp string STRING when displayed in the current
374 buffer. The width is measured by how many columns it occupies on
375 the screen while paying attention to compositions. If PRECISION >
376 0, return the width of longest substring that doesn't exceed
377 PRECISION, and set number of characters and bytes of the substring
378 in *NCHARS and *NBYTES respectively. */
381 lisp_string_width (string
, precision
, nchars
, nbytes
)
383 int precision
, *nchars
, *nbytes
;
385 int len
= XSTRING (string
)->size
;
386 unsigned char *str
= XSTRING (string
)->data
;
387 int i
= 0, i_byte
= 0;
389 struct Lisp_Char_Table
*dp
= buffer_display_table ();
393 int chars
, bytes
, thiswidth
;
398 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
399 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
402 thiswidth
= composition_table
[cmp_id
]->width
;
404 bytes
= string_char_to_byte (string
, end
) - i_byte
;
408 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
411 val
= DISP_CHAR_VECTOR (dp
, c
);
413 thiswidth
= XVECTOR (val
)->size
;
415 thiswidth
= CHAR_WIDTH (c
);
419 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
422 thiswidth
= CHAR_WIDTH (c
);
426 && (width
+ thiswidth
> precision
))
446 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
447 doc
: /* Return width of STRING when displayed in the current buffer.
448 Width is measured by how many columns it occupies on the screen.
449 When calculating width of a multibyte character in STRING,
450 only the base leading-code is considered; the validity of
451 the following bytes is not checked. Tabs in STRING are always
452 taken to occupy `tab-width' columns. */)
459 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
463 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
464 doc
: /* Return the direction of CHAR.
465 The returned value is 0 for left-to-right and 1 for right-to-left. */)
471 CHECK_CHARACTER (ch
);
473 return CHAR_TABLE_REF (Vchar_direction_table
, c
);
476 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
477 doc
: /* Return number of characters between BEG and END.
478 This is now an obsolete function. We keep it just for backward compatibility. */)
480 Lisp_Object beg
, end
;
484 CHECK_NUMBER_COERCE_MARKER (beg
);
485 CHECK_NUMBER_COERCE_MARKER (end
);
487 from
= min (XFASTINT (beg
), XFASTINT (end
));
488 to
= max (XFASTINT (beg
), XFASTINT (end
));
490 return make_number (to
- from
);
493 /* Return the number of characters in the NBYTES bytes at PTR.
494 This works by looking at the contents and checking for multibyte
495 sequences while assuming that there's no invalid sequence.
496 However, if the current buffer has enable-multibyte-characters =
497 nil, we treat each byte as a character. */
500 chars_in_text (ptr
, nbytes
)
504 /* current_buffer is null at early stages of Emacs initialization. */
505 if (current_buffer
== 0
506 || NILP (current_buffer
->enable_multibyte_characters
))
509 return multibyte_chars_in_text (ptr
, nbytes
);
512 /* Return the number of characters in the NBYTES bytes at PTR.
513 This works by looking at the contents and checking for multibyte
514 sequences while assuming that there's no invalid sequence. It
515 ignores enable-multibyte-characters. */
518 multibyte_chars_in_text (ptr
, nbytes
)
522 unsigned char *endp
= ptr
+ nbytes
;
527 int len
= MULTIBYTE_LENGTH (ptr
, endp
);
538 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
539 characters and bytes in it, and store them in *NCHARS and *NBYTES
540 respectively. On counting bytes, pay attention to that 8-bit
541 characters not constructing a valid multibyte sequence are
542 represented by 2-byte in a multibyte text. */
545 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
547 int len
, *nchars
, *nbytes
;
549 unsigned char *endp
= str
+ len
;
550 int n
, chars
= 0, bytes
= 0;
552 if (len
>= MAX_MULTIBYTE_LENGTH
)
554 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
555 while (str
< adjusted_endp
)
557 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (str
)) > 0)
558 str
+= n
, bytes
+= n
;
566 if ((n
= MULTIBYTE_LENGTH (str
, endp
)) > 0)
567 str
+= n
, bytes
+= n
;
578 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
579 It actually converts only such 8-bit characters that don't contruct
580 a multibyte sequence to multibyte forms of Latin-1 characters. If
581 NCHARS is nonzero, set *NCHARS to the number of characters in the
582 text. It is assured that we can use LEN bytes at STR as a work
583 area and that is enough. Return the number of bytes of the
587 str_as_multibyte (str
, len
, nbytes
, nchars
)
589 int len
, nbytes
, *nchars
;
591 unsigned char *p
= str
, *endp
= str
+ nbytes
;
596 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
598 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
599 while (p
< adjusted_endp
600 && (n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
603 while ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
613 safe_bcopy ((char *) p
, (char *) (endp
- nbytes
), nbytes
);
616 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
618 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
619 while (p
< adjusted_endp
)
621 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
629 c
= BYTE8_TO_CHAR (c
);
630 to
+= CHAR_STRING (c
, to
);
637 if ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
645 c
= BYTE8_TO_CHAR (c
);
646 to
+= CHAR_STRING (c
, to
);
655 /* Parse unibyte string at STR of LEN bytes, and return the number of
656 bytes it may ocupy when converted to multibyte string by
657 `str_to_multibyte'. */
660 parse_str_to_multibyte (str
, len
)
664 unsigned char *endp
= str
+ len
;
667 for (bytes
= 0; str
< endp
; str
++)
668 bytes
+= (*str
< 0x80) ? 1 : 2;
673 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
674 that contains the same single-byte characters. It actually
675 converts all 8-bit characters to multibyte forms. It is assured
676 that we can use LEN bytes at STR as a work area and that is
680 str_to_multibyte (str
, len
, bytes
)
684 unsigned char *p
= str
, *endp
= str
+ bytes
;
687 while (p
< endp
&& *p
< 0x80) p
++;
693 safe_bcopy ((char *) p
, (char *) (endp
- bytes
), bytes
);
700 c
= BYTE8_TO_CHAR (c
);
701 to
+= CHAR_STRING (c
, to
);
706 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
707 actually converts characters in the range 0x80..0xFF to
711 str_as_unibyte (str
, bytes
)
715 const unsigned char *p
= str
, *endp
= str
+ bytes
;
722 len
= BYTES_BY_CHAR_HEAD (c
);
723 if (CHAR_BYTE8_HEAD_P (c
))
727 to
= str
+ (p
- str
);
731 len
= BYTES_BY_CHAR_HEAD (c
);
732 if (CHAR_BYTE8_HEAD_P (c
))
734 c
= STRING_CHAR_ADVANCE (p
);
735 *to
++ = CHAR_TO_BYTE8 (c
);
739 while (len
--) *to
++ = *p
++;
746 string_count_byte8 (string
)
749 int multibyte
= STRING_MULTIBYTE (string
);
750 int nbytes
= STRING_BYTES (XSTRING (string
));
751 unsigned char *p
= XSTRING (string
)->data
;
752 unsigned char *pend
= p
+ nbytes
;
760 len
= BYTES_BY_CHAR_HEAD (c
);
762 if (CHAR_BYTE8_HEAD_P (c
))
777 string_escape_byte8 (string
)
780 int nchars
= XSTRING (string
)->size
;
781 int nbytes
= STRING_BYTES (XSTRING (string
));
782 int multibyte
= STRING_MULTIBYTE (string
);
784 const unsigned char *src
, *src_end
;
789 if (multibyte
&& nchars
== nbytes
)
792 byte8_count
= string_count_byte8 (string
);
794 if (byte8_count
== 0)
798 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
799 val
= make_uninit_multibyte_string (nchars
+ byte8_count
* 3,
800 nbytes
+ byte8_count
* 2);
802 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
803 val
= make_uninit_string (nbytes
+ byte8_count
* 3);
805 src
= XSTRING (string
)->data
;
806 src_end
= src
+ nbytes
;
807 dst
= XSTRING (val
)->data
;
809 while (src
< src_end
)
812 len
= BYTES_BY_CHAR_HEAD (c
);
814 if (CHAR_BYTE8_HEAD_P (c
))
816 c
= STRING_CHAR_ADVANCE (src
);
817 c
= CHAR_TO_BYTE8 (c
);
818 sprintf ((char *) dst
, "\\%03o", c
);
822 while (len
--) *dst
++ = *src
++;
825 while (src
< src_end
)
830 sprintf ((char *) dst
, "\\%03o", c
);
840 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
842 Concatenate all the argument characters and make the result a string.
843 usage: (string &rest CHARACTERS) */)
849 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
850 unsigned char *p
= buf
;
853 for (i
= 0; i
< n
; i
++)
855 CHECK_CHARACTER (args
[i
]);
857 p
+= CHAR_STRING (c
, p
);
860 return make_string_from_bytes ((char *) buf
, n
, p
- buf
);
864 init_character_once ()
873 DEFSYM (Qcharacterp
, "characterp");
874 DEFSYM (Qauto_fill_chars
, "auto-fill-chars");
876 staticpro (&Vchar_unify_table
);
877 Vchar_unify_table
= Qnil
;
879 defsubr (&Smax_char
);
880 defsubr (&Scharacterp
);
881 defsubr (&Sunibyte_char_to_multibyte
);
882 defsubr (&Smultibyte_char_to_unibyte
);
883 defsubr (&Schar_bytes
);
884 defsubr (&Schar_width
);
885 defsubr (&Sstring_width
);
886 defsubr (&Schar_direction
);
887 defsubr (&Schars_in_region
);
890 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
892 Vector recording all translation tables ever defined.
893 Each element is a pair (SYMBOL . TABLE) relating the table to the
894 symbol naming it. The ID of a translation table is an index into this vector. */);
895 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
897 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
899 A char-table for characters which invoke auto-filling.
900 Such characters have value t in this table. */);
901 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
902 CHAR_TABLE_SET (Vauto_fill_chars
, ' ', Qt
);
903 CHAR_TABLE_SET (Vauto_fill_chars
, '\n', Qt
);
905 DEFVAR_LISP ("char-width-table", &Vchar_width_table
,
907 A char-table for width (columns) of each character. */);
908 Vchar_width_table
= Fmake_char_table (Qnil
, make_number (1));
909 char_table_set_range (Vchar_width_table
, 0x80, 0x9F, make_number (4));
910 char_table_set_range (Vchar_width_table
, MAX_5_BYTE_CHAR
+ 1, MAX_CHAR
,
913 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table
,
914 doc
: /* A char-table for direction of each character. */);
915 Vchar_direction_table
= Fmake_char_table (Qnil
, make_number (1));
917 DEFVAR_LISP ("printable-chars", &Vprintable_chars
,
918 doc
: /* A char-table for each printable character. */);
919 Vprintable_chars
= Fmake_char_table (Qnil
, Qnil
);
921 DEFVAR_LISP ("char-script-table", &Vchar_script_table
,
922 doc
: /* Char table of script symbols.
923 It has one extra slot whose value is a list of script symbols. */);
925 /* Intern this now in case it isn't already done.
926 Setting this variable twice is harmless.
927 But don't staticpro it here--that is done in alloc.c. */
928 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
929 DEFSYM (Qchar_script_table
, "char-script-table");
930 Fput (Qchar_script_table
, Qchar_table_extra_slots
, make_number (1));
931 Vchar_script_table
= Fmake_char_table (Qchar_script_table
, Qnil
);