1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* At first, see the document in `character.h' to understand the code
37 #include <sys/types.h>
39 #include "character.h"
42 #include "composite.h"
51 Lisp_Object Qcharacterp
;
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector
;
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars
;
60 Lisp_Object Qauto_fill_chars
;
62 Lisp_Object Vchar_unify_table
;
64 /* A char-table. An element is non-nil iff the corresponding
65 character has a printable glyph. */
66 Lisp_Object Vprintable_chars
;
68 /* A char-table. An elemnent is a column-width of the corresponding
70 Lisp_Object Vchar_width_table
;
72 /* A char-table. An element is a symbol indicating the direction
73 property of corresponding character. */
74 Lisp_Object Vchar_direction_table
;
76 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
77 unsigned char *_fetch_multibyte_char_p
;
78 int _fetch_multibyte_char_len
;
80 /* Char table of scripts. */
81 Lisp_Object Vchar_script_table
;
83 static Lisp_Object Qchar_script_table
;
89 char_string_with_unification (c
, p
)
97 if (c
<= MAX_3_BYTE_CHAR
|| c
> MAX_5_BYTE_CHAR
)
99 bytes
= CHAR_STRING (c
, p
);
101 else if (c
<= MAX_4_BYTE_CHAR
)
103 p
[0] = (0xF0 | (c
>> 18));
104 p
[1] = (0x80 | ((c
>> 12) & 0x3F));
105 p
[2] = (0x80 | ((c
>> 6) & 0x3F));
106 p
[3] = (0x80 | (c
& 0x3F));
112 p
[1] = (0x80 | ((c
>> 18) & 0x0F));
113 p
[2] = (0x80 | ((c
>> 12) & 0x3F));
114 p
[3] = (0x80 | ((c
>> 6) & 0x3F));
115 p
[4] = (0x80 | (c
& 0x3F));
124 string_char_with_unification (p
, advanced
, len
)
125 const unsigned char *p
;
126 const unsigned char **advanced
;
130 const unsigned char *saved_p
= p
;
132 if (*p
< 0x80 || ! (*p
& 0x20) || ! (*p
& 0x10))
134 c
= STRING_CHAR_ADVANCE (p
);
136 else if (! (*p
& 0x08))
138 c
= ((((p
)[0] & 0xF) << 18)
139 | (((p
)[1] & 0x3F) << 12)
140 | (((p
)[2] & 0x3F) << 6)
146 c
= ((((p
)[1] & 0x3F) << 18)
147 | (((p
)[2] & 0x3F) << 12)
148 | (((p
)[3] & 0x3F) << 6)
153 MAYBE_UNIFY_CHAR (c
);
163 /* Translate character C by translation table TABLE. If C is
164 negative, translate a character specified by CHARSET and CODE. If
165 no translation is found in TABLE, return the untranslated
169 translate_char (table
, c
)
175 if (! CHAR_TABLE_P (table
))
177 ch
= CHAR_TABLE_REF (table
, c
);
178 if (! CHARACTERP (ch
))
183 /* Convert the unibyte character C to the corresponding multibyte
184 character based on the current value of charset_primary. If C
185 can't be converted, return C. */
188 unibyte_char_to_multibyte (c
)
191 struct charset
*charset
= CHARSET_FROM_ID (charset_primary
);
192 int c1
= DECODE_CHAR (charset
, c
);
194 return ((c1
>= 0) ? c1
: c
);
198 /* Convert the multibyte character C to unibyte 8-bit character based
199 on the current value of charset_primary. If dimension of
200 charset_primary is more than one, return (C & 0xFF).
202 The argument REV_TBL is now ignored. It will be removed in the
206 multibyte_char_to_unibyte (c
, rev_tbl
)
210 struct charset
*charset
= CHARSET_FROM_ID (charset_primary
);
211 unsigned c1
= ENCODE_CHAR (charset
, c
);
213 return ((c1
!= CHARSET_INVALID_CODE (charset
)) ? c1
: c
& 0xFF);
217 DEFUN ("characterp", Fcharacterp
, Scharacterp
, 1, 2, 0,
218 doc
: /* Return non-nil if OBJECT is a character. */)
220 Lisp_Object object
, ignore
;
222 return (CHARACTERP (object
) ? Qt
: Qnil
);
225 DEFUN ("max-char", Fmax_char
, Smax_char
, 0, 0, 0,
226 doc
: /* Return the character of the maximum code. */)
229 return make_number (MAX_CHAR
);
232 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
233 Sunibyte_char_to_multibyte
, 1, 1, 0,
234 doc
: /* Convert the unibyte character CH to multibyte character.
235 The multibyte character is a result of decoding CH by
236 the current primary charset (value of `charset-primary'). */)
241 struct charset
*charset
;
243 CHECK_CHARACTER (ch
);
246 error ("Invalid unibyte character: %d", c
);
247 charset
= CHARSET_FROM_ID (charset_primary
);
248 c
= DECODE_CHAR (charset
, c
);
250 c
= BYTE8_TO_CHAR (XFASTINT (ch
));
251 return make_number (c
);
254 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
255 Smultibyte_char_to_unibyte
, 1, 1, 0,
256 doc
: /* Convert the multibyte character CH to unibyte character.\n\
257 The unibyte character is a result of encoding CH by
258 the current primary charset (value of `charset-primary'). */)
263 struct charset
*charset
;
265 CHECK_CHARACTER (ch
);
267 c
= CHAR_TO_BYTE8 (c
);
268 return make_number (c
);
271 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
272 doc
: /* Return 1 regardless of the argument CHAR.
273 This is now an obsolete function. We keep it just for backward compatibility. */)
277 CHECK_CHARACTER (ch
);
278 return make_number (1);
281 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
282 doc
: /* Return width of CHAR when displayed in the current buffer.
283 The width is measured by how many columns it occupies on the screen.
284 Tab is taken to occupy `tab-width' columns. */)
290 struct Lisp_Char_Table
*dp
= buffer_display_table ();
292 CHECK_CHARACTER (ch
);
295 /* Get the way the display table would display it. */
296 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
299 width
= ASIZE (disp
);
301 width
= CHAR_WIDTH (c
);
303 return make_number (width
);
306 /* Return width of string STR of length LEN when displayed in the
307 current buffer. The width is measured by how many columns it
308 occupies on the screen. If PRECISION > 0, return the width of
309 longest substring that doesn't exceed PRECISION, and set number of
310 characters and bytes of the substring in *NCHARS and *NBYTES
314 c_string_width (str
, len
, precision
, nchars
, nbytes
)
316 int precision
, *nchars
, *nbytes
;
318 int i
= 0, i_byte
= 0;
320 struct Lisp_Char_Table
*dp
= buffer_display_table ();
324 int bytes
, thiswidth
;
326 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
330 val
= DISP_CHAR_VECTOR (dp
, c
);
332 thiswidth
= XVECTOR (val
)->size
;
334 thiswidth
= CHAR_WIDTH (c
);
338 thiswidth
= CHAR_WIDTH (c
);
342 && (width
+ thiswidth
> precision
))
362 /* Return width of string STR of length LEN when displayed in the
363 current buffer. The width is measured by how many columns it
364 occupies on the screen. */
371 return c_string_width (str
, len
, -1, NULL
, NULL
);
374 /* Return width of Lisp string STRING when displayed in the current
375 buffer. The width is measured by how many columns it occupies on
376 the screen while paying attention to compositions. If PRECISION >
377 0, return the width of longest substring that doesn't exceed
378 PRECISION, and set number of characters and bytes of the substring
379 in *NCHARS and *NBYTES respectively. */
382 lisp_string_width (string
, precision
, nchars
, nbytes
)
384 int precision
, *nchars
, *nbytes
;
386 int len
= XSTRING (string
)->size
;
387 unsigned char *str
= XSTRING (string
)->data
;
388 int i
= 0, i_byte
= 0;
390 struct Lisp_Char_Table
*dp
= buffer_display_table ();
394 int chars
, bytes
, thiswidth
;
399 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
400 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
403 thiswidth
= composition_table
[cmp_id
]->width
;
405 bytes
= string_char_to_byte (string
, end
) - i_byte
;
409 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
412 val
= DISP_CHAR_VECTOR (dp
, c
);
414 thiswidth
= XVECTOR (val
)->size
;
416 thiswidth
= CHAR_WIDTH (c
);
420 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
423 thiswidth
= CHAR_WIDTH (c
);
427 && (width
+ thiswidth
> precision
))
447 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
448 doc
: /* Return width of STRING when displayed in the current buffer.
449 Width is measured by how many columns it occupies on the screen.
450 When calculating width of a multibyte character in STRING,
451 only the base leading-code is considered; the validity of
452 the following bytes is not checked. Tabs in STRING are always
453 taken to occupy `tab-width' columns. */)
460 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
464 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
465 doc
: /* Return the direction of CHAR.
466 The returned value is 0 for left-to-right and 1 for right-to-left. */)
472 CHECK_CHARACTER (ch
);
474 return CHAR_TABLE_REF (Vchar_direction_table
, c
);
477 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
478 doc
: /* Return number of characters between BEG and END.
479 This is now an obsolete function. We keep it just for backward compatibility. */)
481 Lisp_Object beg
, end
;
485 CHECK_NUMBER_COERCE_MARKER (beg
);
486 CHECK_NUMBER_COERCE_MARKER (end
);
488 from
= min (XFASTINT (beg
), XFASTINT (end
));
489 to
= max (XFASTINT (beg
), XFASTINT (end
));
491 return make_number (to
- from
);
494 /* Return the number of characters in the NBYTES bytes at PTR.
495 This works by looking at the contents and checking for multibyte
496 sequences while assuming that there's no invalid sequence.
497 However, if the current buffer has enable-multibyte-characters =
498 nil, we treat each byte as a character. */
501 chars_in_text (ptr
, nbytes
)
505 /* current_buffer is null at early stages of Emacs initialization. */
506 if (current_buffer
== 0
507 || NILP (current_buffer
->enable_multibyte_characters
))
510 return multibyte_chars_in_text (ptr
, nbytes
);
513 /* Return the number of characters in the NBYTES bytes at PTR.
514 This works by looking at the contents and checking for multibyte
515 sequences while assuming that there's no invalid sequence. It
516 ignores enable-multibyte-characters. */
519 multibyte_chars_in_text (ptr
, nbytes
)
523 unsigned char *endp
= ptr
+ nbytes
;
528 int len
= MULTIBYTE_LENGTH (ptr
, endp
);
539 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
540 characters and bytes in it, and store them in *NCHARS and *NBYTES
541 respectively. On counting bytes, pay attention to that 8-bit
542 characters not constructing a valid multibyte sequence are
543 represented by 2-byte in a multibyte text. */
546 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
548 int len
, *nchars
, *nbytes
;
550 unsigned char *endp
= str
+ len
;
551 int n
, chars
= 0, bytes
= 0;
553 if (len
>= MAX_MULTIBYTE_LENGTH
)
555 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
556 while (str
< adjusted_endp
)
558 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (str
)) > 0)
559 str
+= n
, bytes
+= n
;
567 if ((n
= MULTIBYTE_LENGTH (str
, endp
)) > 0)
568 str
+= n
, bytes
+= n
;
579 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
580 It actually converts only such 8-bit characters that don't contruct
581 a multibyte sequence to multibyte forms of Latin-1 characters. If
582 NCHARS is nonzero, set *NCHARS to the number of characters in the
583 text. It is assured that we can use LEN bytes at STR as a work
584 area and that is enough. Return the number of bytes of the
588 str_as_multibyte (str
, len
, nbytes
, nchars
)
590 int len
, nbytes
, *nchars
;
592 unsigned char *p
= str
, *endp
= str
+ nbytes
;
597 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
599 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
600 while (p
< adjusted_endp
601 && (n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
604 while ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
614 safe_bcopy ((char *) p
, (char *) (endp
- nbytes
), nbytes
);
617 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
619 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
620 while (p
< adjusted_endp
)
622 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
630 c
= BYTE8_TO_CHAR (c
);
631 to
+= CHAR_STRING (c
, to
);
638 if ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
646 c
= BYTE8_TO_CHAR (c
);
647 to
+= CHAR_STRING (c
, to
);
656 /* Parse unibyte string at STR of LEN bytes, and return the number of
657 bytes it may ocupy when converted to multibyte string by
658 `str_to_multibyte'. */
661 parse_str_to_multibyte (str
, len
)
665 unsigned char *endp
= str
+ len
;
668 for (bytes
= 0; str
< endp
; str
++)
669 bytes
+= (*str
< 0x80) ? 1 : 2;
674 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
675 that contains the same single-byte characters. It actually
676 converts all 8-bit characters to multibyte forms. It is assured
677 that we can use LEN bytes at STR as a work area and that is
681 str_to_multibyte (str
, len
, bytes
)
685 unsigned char *p
= str
, *endp
= str
+ bytes
;
688 while (p
< endp
&& *p
< 0x80) p
++;
694 safe_bcopy ((char *) p
, (char *) (endp
- bytes
), bytes
);
701 c
= BYTE8_TO_CHAR (c
);
702 to
+= CHAR_STRING (c
, to
);
707 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
708 actually converts characters in the range 0x80..0xFF to
712 str_as_unibyte (str
, bytes
)
716 const unsigned char *p
= str
, *endp
= str
+ bytes
;
723 len
= BYTES_BY_CHAR_HEAD (c
);
724 if (CHAR_BYTE8_HEAD_P (c
))
728 to
= str
+ (p
- str
);
732 len
= BYTES_BY_CHAR_HEAD (c
);
733 if (CHAR_BYTE8_HEAD_P (c
))
735 c
= STRING_CHAR_ADVANCE (p
);
736 *to
++ = CHAR_TO_BYTE8 (c
);
740 while (len
--) *to
++ = *p
++;
747 string_count_byte8 (string
)
750 int multibyte
= STRING_MULTIBYTE (string
);
751 int nbytes
= STRING_BYTES (XSTRING (string
));
752 unsigned char *p
= XSTRING (string
)->data
;
753 unsigned char *pend
= p
+ nbytes
;
761 len
= BYTES_BY_CHAR_HEAD (c
);
763 if (CHAR_BYTE8_HEAD_P (c
))
778 string_escape_byte8 (string
)
781 int nchars
= XSTRING (string
)->size
;
782 int nbytes
= STRING_BYTES (XSTRING (string
));
783 int multibyte
= STRING_MULTIBYTE (string
);
785 const unsigned char *src
, *src_end
;
790 if (multibyte
&& nchars
== nbytes
)
793 byte8_count
= string_count_byte8 (string
);
795 if (byte8_count
== 0)
799 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
800 val
= make_uninit_multibyte_string (nchars
+ byte8_count
* 3,
801 nbytes
+ byte8_count
* 2);
803 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
804 val
= make_uninit_string (nbytes
+ byte8_count
* 3);
806 src
= XSTRING (string
)->data
;
807 src_end
= src
+ nbytes
;
808 dst
= XSTRING (val
)->data
;
810 while (src
< src_end
)
813 len
= BYTES_BY_CHAR_HEAD (c
);
815 if (CHAR_BYTE8_HEAD_P (c
))
817 c
= STRING_CHAR_ADVANCE (src
);
818 c
= CHAR_TO_BYTE8 (c
);
819 sprintf ((char *) dst
, "\\%03o", c
);
823 while (len
--) *dst
++ = *src
++;
826 while (src
< src_end
)
831 sprintf ((char *) dst
, "\\%03o", c
);
841 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
843 Concatenate all the argument characters and make the result a string.
844 usage: (string &rest CHARACTERS) */)
850 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
851 unsigned char *p
= buf
;
854 for (i
= 0; i
< n
; i
++)
856 CHECK_CHARACTER (args
[i
]);
858 p
+= CHAR_STRING (c
, p
);
861 return make_string_from_bytes ((char *) buf
, n
, p
- buf
);
865 init_character_once ()
874 DEFSYM (Qcharacterp
, "characterp");
875 DEFSYM (Qauto_fill_chars
, "auto-fill-chars");
877 staticpro (&Vchar_unify_table
);
878 Vchar_unify_table
= Qnil
;
880 defsubr (&Smax_char
);
881 defsubr (&Scharacterp
);
882 defsubr (&Sunibyte_char_to_multibyte
);
883 defsubr (&Smultibyte_char_to_unibyte
);
884 defsubr (&Schar_bytes
);
885 defsubr (&Schar_width
);
886 defsubr (&Sstring_width
);
887 defsubr (&Schar_direction
);
888 defsubr (&Schars_in_region
);
891 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
893 Vector recording all translation tables ever defined.
894 Each element is a pair (SYMBOL . TABLE) relating the table to the
895 symbol naming it. The ID of a translation table is an index into this vector. */);
896 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
898 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
900 A char-table for characters which invoke auto-filling.
901 Such characters have value t in this table. */);
902 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
903 CHAR_TABLE_SET (Vauto_fill_chars
, ' ', Qt
);
904 CHAR_TABLE_SET (Vauto_fill_chars
, '\n', Qt
);
906 DEFVAR_LISP ("char-width-table", &Vchar_width_table
,
908 A char-table for width (columns) of each character. */);
909 Vchar_width_table
= Fmake_char_table (Qnil
, make_number (1));
910 char_table_set_range (Vchar_width_table
, 0x80, 0x9F, make_number (4));
911 char_table_set_range (Vchar_width_table
, MAX_5_BYTE_CHAR
+ 1, MAX_CHAR
,
914 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table
,
915 doc
: /* A char-table for direction of each character. */);
916 Vchar_direction_table
= Fmake_char_table (Qnil
, make_number (1));
918 DEFVAR_LISP ("printable-chars", &Vprintable_chars
,
919 doc
: /* A char-table for each printable character. */);
920 Vprintable_chars
= Fmake_char_table (Qnil
, Qnil
);
922 DEFVAR_LISP ("char-script-table", &Vchar_script_table
,
923 doc
: /* Char table of script symbols.
924 It has one extra slot whose value is a list of script symbols. */);
926 /* Intern this now in case it isn't already done.
927 Setting this variable twice is harmless.
928 But don't staticpro it here--that is done in alloc.c. */
929 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
930 DEFSYM (Qchar_script_table
, "char-script-table");
931 Fput (Qchar_script_table
, Qchar_table_extra_slots
, make_number (1));
932 Vchar_script_table
= Fmake_char_table (Qchar_script_table
, Qnil
);