1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* At first, see the document in `character.h' to understand the code
37 #include <sys/types.h>
39 #include "character.h"
42 #include "composite.h"
/* The symbol `characterp'.  It is set up by the DEFSYM call in this
   file's symbol initialization code.  */
51 Lisp_Object Qcharacterp
;
53 /* Vector of all translation tables ever defined.
54 The ID of a translation table is an index into this vector. */
55 Lisp_Object Vtranslation_table_vector
;
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars
;
/* The symbol `auto-fill-chars'.  It is set up by DEFSYM and passed to
   Fmake_char_table when Vauto_fill_chars is created.  */
60 Lisp_Object Qauto_fill_chars
;
/* Protected from garbage collection with staticpro and initialized to
   nil by this file's setup code.
   NOTE(review): presumably the table consulted when unifying
   characters (see MAYBE_UNIFY_CHAR) -- confirm against character.h.  */
62 Lisp_Object Vchar_unify_table
;
64 /* A char-table. An element is non-nil iff the corresponding
65 character has a printable glyph. */
66 Lisp_Object Vprintable_chars
;
68 /* A char-table. An element is a column-width of the corresponding
70 Lisp_Object Vchar_width_table
;
72 /* A char-table. An element is a symbol indicating the direction
73 property of corresponding character. */
74 Lisp_Object Vchar_direction_table
;
76 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
77 unsigned char *_fetch_multibyte_char_p
;
79 /* Char table of scripts. */
80 Lisp_Object Vchar_script_table
;
/* The symbol `char-script-table'.  The setup code gives it a
   `char-table-extra-slots' property of 1 and passes it to
   Fmake_char_table when Vchar_script_table is created.  */
82 static Lisp_Object Qchar_script_table
;
88 char_string_with_unification (c
, p
)
96 if (c
<= MAX_3_BYTE_CHAR
|| c
> MAX_5_BYTE_CHAR
)
98 bytes
= CHAR_STRING (c
, p
);
100 else if (c
<= MAX_4_BYTE_CHAR
)
102 p
[0] = (0xF0 | (c
>> 18));
103 p
[1] = (0x80 | ((c
>> 12) & 0x3F));
104 p
[2] = (0x80 | ((c
>> 6) & 0x3F));
105 p
[3] = (0x80 | (c
& 0x3F));
111 p
[1] = (0x80 | ((c
>> 18) & 0x0F));
112 p
[2] = (0x80 | ((c
>> 12) & 0x3F));
113 p
[3] = (0x80 | ((c
>> 6) & 0x3F));
114 p
[4] = (0x80 | (c
& 0x3F));
123 string_char_with_unification (p
, advanced
, len
)
124 const unsigned char *p
;
125 const unsigned char **advanced
;
129 const unsigned char *saved_p
= p
;
131 if (*p
< 0x80 || ! (*p
& 0x20) || ! (*p
& 0x10))
133 c
= STRING_CHAR_ADVANCE (p
);
135 else if (! (*p
& 0x08))
137 c
= ((((p
)[0] & 0xF) << 18)
138 | (((p
)[1] & 0x3F) << 12)
139 | (((p
)[2] & 0x3F) << 6)
145 c
= ((((p
)[1] & 0x3F) << 18)
146 | (((p
)[2] & 0x3F) << 12)
147 | (((p
)[3] & 0x3F) << 6)
152 MAYBE_UNIFY_CHAR (c
);
162 /* Translate character C by translation table TABLE. If C is
163 negative, translate a character specified by CHARSET and CODE. If
164 no translation is found in TABLE, return the untranslated
168 translate_char (table
, c
)
174 if (! CHAR_TABLE_P (table
))
176 ch
= CHAR_TABLE_REF (table
, c
);
177 if (! CHARACTERP (ch
))
182 /* Convert the unibyte character C to the corresponding multibyte
183 character based on the current value of charset_unibyte. If C
184 can't be converted, return C. */
187 unibyte_char_to_multibyte (c
)
190 struct charset
*charset
= CHARSET_FROM_ID (charset_unibyte
);
191 int c1
= DECODE_CHAR (charset
, c
);
193 return ((c1
>= 0) ? c1
: c
);
197 /* Convert the multibyte character C to unibyte 8-bit character based
198 on the current value of charset_unibyte. If dimension of
199 charset_unibyte is more than one, return (C & 0xFF).
201 The argument REV_TBL is now ignored. It will be removed in the
205 multibyte_char_to_unibyte (c
, rev_tbl
)
209 struct charset
*charset
= CHARSET_FROM_ID (charset_unibyte
);
210 unsigned c1
= ENCODE_CHAR (charset
, c
);
212 return ((c1
!= CHARSET_INVALID_CODE (charset
)) ? c1
: c
& 0xFF);
216 DEFUN ("characterp", Fcharacterp
, Scharacterp
, 1, 2, 0,
217 doc
: /* Return non-nil if OBJECT is a character. */)
219 Lisp_Object object
, ignore
;
221 return (CHARACTERP (object
) ? Qt
: Qnil
);
224 DEFUN ("max-char", Fmax_char
, Smax_char
, 0, 0, 0,
225 doc
: /* Return the character of the maximum code. */)
228 return make_number (MAX_CHAR
);
231 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
232 Sunibyte_char_to_multibyte
, 1, 1, 0,
233 doc
: /* Convert the unibyte character CH to multibyte character.
234 The multibyte character is a result of decoding CH by
235 the current unibyte charset (see `unibyte-charset'). */)
240 struct charset
*charset
;
242 CHECK_CHARACTER (ch
);
245 error ("Invalid unibyte character: %d", c
);
246 charset
= CHARSET_FROM_ID (charset_unibyte
);
247 c
= DECODE_CHAR (charset
, c
);
249 c
= BYTE8_TO_CHAR (XFASTINT (ch
));
250 return make_number (c
);
253 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
254 Smultibyte_char_to_unibyte
, 1, 1, 0,
255 doc
: /* Convert the multibyte character CH to unibyte character.\n\
256 The unibyte character is a result of encoding CH by
257 the current primary charset (value of `charset-primary'). */)
263 CHECK_CHARACTER (ch
);
265 c
= CHAR_TO_BYTE8 (c
);
266 return make_number (c
);
269 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
270 doc
: /* Return 1 regardless of the argument CHAR.
271 This is now an obsolete function. We keep it just for backward compatibility. */)
275 CHECK_CHARACTER (ch
);
276 return make_number (1);
279 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
280 doc
: /* Return width of CHAR when displayed in the current buffer.
281 The width is measured by how many columns it occupies on the screen.
282 Tab is taken to occupy `tab-width' columns. */)
288 struct Lisp_Char_Table
*dp
= buffer_display_table ();
290 CHECK_CHARACTER (ch
);
293 /* Get the way the display table would display it. */
294 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
297 width
= ASIZE (disp
);
299 width
= CHAR_WIDTH (c
);
301 return make_number (width
);
304 /* Return width of string STR of length LEN when displayed in the
305 current buffer. The width is measured by how many columns it
306 occupies on the screen. If PRECISION > 0, return the width of
307 longest substring that doesn't exceed PRECISION, and set number of
308 characters and bytes of the substring in *NCHARS and *NBYTES
312 c_string_width (str
, len
, precision
, nchars
, nbytes
)
314 int precision
, *nchars
, *nbytes
;
316 int i
= 0, i_byte
= 0;
318 struct Lisp_Char_Table
*dp
= buffer_display_table ();
322 int bytes
, thiswidth
;
324 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
328 val
= DISP_CHAR_VECTOR (dp
, c
);
330 thiswidth
= XVECTOR (val
)->size
;
332 thiswidth
= CHAR_WIDTH (c
);
336 thiswidth
= CHAR_WIDTH (c
);
340 && (width
+ thiswidth
> precision
))
360 /* Return width of string STR of length LEN when displayed in the
361 current buffer. The width is measured by how many columns it
362 occupies on the screen. */
369 return c_string_width (str
, len
, -1, NULL
, NULL
);
372 /* Return width of Lisp string STRING when displayed in the current
373 buffer. The width is measured by how many columns it occupies on
374 the screen while paying attention to compositions. If PRECISION >
375 0, return the width of longest substring that doesn't exceed
376 PRECISION, and set number of characters and bytes of the substring
377 in *NCHARS and *NBYTES respectively. */
380 lisp_string_width (string
, precision
, nchars
, nbytes
)
382 int precision
, *nchars
, *nbytes
;
384 int len
= XSTRING (string
)->size
;
385 unsigned char *str
= XSTRING (string
)->data
;
386 int i
= 0, i_byte
= 0;
388 struct Lisp_Char_Table
*dp
= buffer_display_table ();
392 int chars
, bytes
, thiswidth
;
397 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
398 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
401 thiswidth
= composition_table
[cmp_id
]->width
;
403 bytes
= string_char_to_byte (string
, end
) - i_byte
;
407 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
410 val
= DISP_CHAR_VECTOR (dp
, c
);
412 thiswidth
= XVECTOR (val
)->size
;
414 thiswidth
= CHAR_WIDTH (c
);
418 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
421 thiswidth
= CHAR_WIDTH (c
);
425 && (width
+ thiswidth
> precision
))
445 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
446 doc
: /* Return width of STRING when displayed in the current buffer.
447 Width is measured by how many columns it occupies on the screen.
448 When calculating width of a multibyte character in STRING,
449 only the base leading-code is considered; the validity of
450 the following bytes is not checked. Tabs in STRING are always
451 taken to occupy `tab-width' columns. */)
458 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
462 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
463 doc
: /* Return the direction of CHAR.
464 The returned value is 0 for left-to-right and 1 for right-to-left. */)
470 CHECK_CHARACTER (ch
);
472 return CHAR_TABLE_REF (Vchar_direction_table
, c
);
475 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
476 doc
: /* Return number of characters between BEG and END.
477 This is now an obsolete function. We keep it just for backward compatibility. */)
479 Lisp_Object beg
, end
;
483 CHECK_NUMBER_COERCE_MARKER (beg
);
484 CHECK_NUMBER_COERCE_MARKER (end
);
486 from
= min (XFASTINT (beg
), XFASTINT (end
));
487 to
= max (XFASTINT (beg
), XFASTINT (end
));
489 return make_number (to
- from
);
492 /* Return the number of characters in the NBYTES bytes at PTR.
493 This works by looking at the contents and checking for multibyte
494 sequences while assuming that there's no invalid sequence.
495 However, if the current buffer has enable-multibyte-characters =
496 nil, we treat each byte as a character. */
499 chars_in_text (ptr
, nbytes
)
503 /* current_buffer is null at early stages of Emacs initialization. */
504 if (current_buffer
== 0
505 || NILP (current_buffer
->enable_multibyte_characters
))
508 return multibyte_chars_in_text (ptr
, nbytes
);
511 /* Return the number of characters in the NBYTES bytes at PTR.
512 This works by looking at the contents and checking for multibyte
513 sequences while assuming that there's no invalid sequence. It
514 ignores enable-multibyte-characters. */
517 multibyte_chars_in_text (ptr
, nbytes
)
521 unsigned char *endp
= ptr
+ nbytes
;
526 int len
= MULTIBYTE_LENGTH (ptr
, endp
);
537 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
538 characters and bytes in it, and store them in *NCHARS and *NBYTES
539 respectively. When counting bytes, note that 8-bit
540 characters that do not form a valid multibyte sequence are
541 represented by 2 bytes in multibyte text. */
544 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
546 int len
, *nchars
, *nbytes
;
548 unsigned char *endp
= str
+ len
;
549 int n
, chars
= 0, bytes
= 0;
551 if (len
>= MAX_MULTIBYTE_LENGTH
)
553 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
554 while (str
< adjusted_endp
)
556 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (str
)) > 0)
557 str
+= n
, bytes
+= n
;
565 if ((n
= MULTIBYTE_LENGTH (str
, endp
)) > 0)
566 str
+= n
, bytes
+= n
;
577 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
578 It actually converts only those 8-bit characters that don't construct
579 a multibyte sequence to multibyte forms of Latin-1 characters. If
580 NCHARS is nonzero, set *NCHARS to the number of characters in the
581 text. It is assured that we can use LEN bytes at STR as a work
582 area and that is enough. Return the number of bytes of the
586 str_as_multibyte (str
, len
, nbytes
, nchars
)
588 int len
, nbytes
, *nchars
;
590 unsigned char *p
= str
, *endp
= str
+ nbytes
;
595 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
597 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
598 while (p
< adjusted_endp
599 && (n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
602 while ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
612 safe_bcopy ((char *) p
, (char *) (endp
- nbytes
), nbytes
);
615 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
617 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
618 while (p
< adjusted_endp
)
620 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
628 c
= BYTE8_TO_CHAR (c
);
629 to
+= CHAR_STRING (c
, to
);
636 if ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
644 c
= BYTE8_TO_CHAR (c
);
645 to
+= CHAR_STRING (c
, to
);
654 /* Parse unibyte string at STR of LEN bytes, and return the number of
655 bytes it may occupy when converted to a multibyte string by
656 `str_to_multibyte'. */
659 parse_str_to_multibyte (str
, len
)
663 unsigned char *endp
= str
+ len
;
666 for (bytes
= 0; str
< endp
; str
++)
667 bytes
+= (*str
< 0x80) ? 1 : 2;
672 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
673 that contains the same single-byte characters. It actually
674 converts all 8-bit characters to multibyte forms. It is assured
675 that we can use LEN bytes at STR as a work area and that is
679 str_to_multibyte (str
, len
, bytes
)
683 unsigned char *p
= str
, *endp
= str
+ bytes
;
686 while (p
< endp
&& *p
< 0x80) p
++;
692 safe_bcopy ((char *) p
, (char *) (endp
- bytes
), bytes
);
699 c
= BYTE8_TO_CHAR (c
);
700 to
+= CHAR_STRING (c
, to
);
705 /* Arrange multibyte text at STR of BYTES bytes as a unibyte text. It
706 actually converts characters in the range 0x80..0xFF to
710 str_as_unibyte (str
, bytes
)
714 const unsigned char *p
= str
, *endp
= str
+ bytes
;
721 len
= BYTES_BY_CHAR_HEAD (c
);
722 if (CHAR_BYTE8_HEAD_P (c
))
726 to
= str
+ (p
- str
);
730 len
= BYTES_BY_CHAR_HEAD (c
);
731 if (CHAR_BYTE8_HEAD_P (c
))
733 c
= STRING_CHAR_ADVANCE (p
);
734 *to
++ = CHAR_TO_BYTE8 (c
);
738 while (len
--) *to
++ = *p
++;
745 string_count_byte8 (string
)
748 int multibyte
= STRING_MULTIBYTE (string
);
749 int nbytes
= STRING_BYTES (XSTRING (string
));
750 unsigned char *p
= XSTRING (string
)->data
;
751 unsigned char *pend
= p
+ nbytes
;
759 len
= BYTES_BY_CHAR_HEAD (c
);
761 if (CHAR_BYTE8_HEAD_P (c
))
776 string_escape_byte8 (string
)
779 int nchars
= XSTRING (string
)->size
;
780 int nbytes
= STRING_BYTES (XSTRING (string
));
781 int multibyte
= STRING_MULTIBYTE (string
);
783 const unsigned char *src
, *src_end
;
788 if (multibyte
&& nchars
== nbytes
)
791 byte8_count
= string_count_byte8 (string
);
793 if (byte8_count
== 0)
797 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
798 val
= make_uninit_multibyte_string (nchars
+ byte8_count
* 3,
799 nbytes
+ byte8_count
* 2);
801 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
802 val
= make_uninit_string (nbytes
+ byte8_count
* 3);
804 src
= XSTRING (string
)->data
;
805 src_end
= src
+ nbytes
;
806 dst
= XSTRING (val
)->data
;
808 while (src
< src_end
)
811 len
= BYTES_BY_CHAR_HEAD (c
);
813 if (CHAR_BYTE8_HEAD_P (c
))
815 c
= STRING_CHAR_ADVANCE (src
);
816 c
= CHAR_TO_BYTE8 (c
);
817 sprintf ((char *) dst
, "\\%03o", c
);
821 while (len
--) *dst
++ = *src
++;
824 while (src
< src_end
)
829 sprintf ((char *) dst
, "\\%03o", c
);
839 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
841 Concatenate all the argument characters and make the result a string.
842 usage: (string &rest CHARACTERS) */)
848 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
849 unsigned char *p
= buf
;
852 for (i
= 0; i
< n
; i
++)
854 CHECK_CHARACTER (args
[i
]);
856 p
+= CHAR_STRING (c
, p
);
859 return make_string_from_bytes ((char *) buf
, n
, p
- buf
);
863 init_character_once ()
872 DEFSYM (Qcharacterp
, "characterp");
873 DEFSYM (Qauto_fill_chars
, "auto-fill-chars");
875 staticpro (&Vchar_unify_table
);
876 Vchar_unify_table
= Qnil
;
878 defsubr (&Smax_char
);
879 defsubr (&Scharacterp
);
880 defsubr (&Sunibyte_char_to_multibyte
);
881 defsubr (&Smultibyte_char_to_unibyte
);
882 defsubr (&Schar_bytes
);
883 defsubr (&Schar_width
);
884 defsubr (&Sstring_width
);
885 defsubr (&Schar_direction
);
886 defsubr (&Schars_in_region
);
889 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
891 Vector recording all translation tables ever defined.
892 Each element is a pair (SYMBOL . TABLE) relating the table to the
893 symbol naming it. The ID of a translation table is an index into this vector. */);
894 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
896 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
898 A char-table for characters which invoke auto-filling.
899 Such characters have value t in this table. */);
900 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
901 CHAR_TABLE_SET (Vauto_fill_chars
, ' ', Qt
);
902 CHAR_TABLE_SET (Vauto_fill_chars
, '\n', Qt
);
904 DEFVAR_LISP ("char-width-table", &Vchar_width_table
,
906 A char-table for width (columns) of each character. */);
907 Vchar_width_table
= Fmake_char_table (Qnil
, make_number (1));
908 char_table_set_range (Vchar_width_table
, 0x80, 0x9F, make_number (4));
909 char_table_set_range (Vchar_width_table
, MAX_5_BYTE_CHAR
+ 1, MAX_CHAR
,
912 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table
,
913 doc
: /* A char-table for direction of each character. */);
914 Vchar_direction_table
= Fmake_char_table (Qnil
, make_number (1));
916 DEFVAR_LISP ("printable-chars", &Vprintable_chars
,
917 doc
: /* A char-table for each printable character. */);
918 Vprintable_chars
= Fmake_char_table (Qnil
, Qnil
);
920 DEFVAR_LISP ("char-script-table", &Vchar_script_table
,
921 doc
: /* Char table of script symbols.
922 It has one extra slot whose value is a list of script symbols. */);
924 /* Intern this now in case it isn't already done.
925 Setting this variable twice is harmless.
926 But don't staticpro it here--that is done in alloc.c. */
927 Qchar_table_extra_slots
= intern ("char-table-extra-slots");
928 DEFSYM (Qchar_script_table
, "char-script-table");
929 Fput (Qchar_script_table
, Qchar_table_extra_slots
, make_number (1));
930 Vchar_script_table
= Fmake_char_table (Qchar_script_table
, Qnil
);