]> code.delx.au - gnu-emacs/blob - src/charset.c
(split_non_ascii_string): Define return value.
[gnu-emacs] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #ifdef emacs
26 #include <config.h>
27 #endif
28
29 #include <stdio.h>
30
31 #ifdef emacs
32
33 #include <sys/types.h>
34 #include "lisp.h"
35 #include "buffer.h"
36 #include "charset.h"
37 #include "coding.h"
38 #include "disptab.h"
39
40 #else /* not emacs */
41
42 #include "mulelib.h"
43
44 #endif /* emacs */
45
46 Lisp_Object Qcharset, Qascii, Qcomposition;
47 Lisp_Object Qunknown;
48
49 /* Declaration of special leading-codes. */
50 int leading_code_composition; /* for composite characters */
51 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
52 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
53 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
54 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
55
56 /* Declaration of special charsets. */
57 int charset_ascii; /* ASCII */
58 int charset_composition; /* for a composite character */
59 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
60 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
61 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
62 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
63 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
64 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
65 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
66
67 int min_composite_char;
68
69 Lisp_Object Qcharset_table;
70
71 /* A char-table containing information of each character set. */
72 Lisp_Object Vcharset_table;
73
74 /* A vector of charset symbol indexed by charset-id. This is used
75 only for returning charset symbol from C functions. */
76 Lisp_Object Vcharset_symbol_table;
77
78 /* A list of charset symbols ever defined. */
79 Lisp_Object Vcharset_list;
80
81 /* Vector of translation table ever defined.
82 ID of a translation table is used to index this vector. */
83 Lisp_Object Vtranslation_table_vector;
84
85 /* A char-table for characters which may invoke auto-filling. */
86 Lisp_Object Vauto_fill_chars;
87
88 Lisp_Object Qauto_fill_chars;
89
90 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
91 int bytes_by_char_head[256];
92 int width_by_char_head[256];
93
94 /* Mapping table from ISO2022's charset (specified by DIMENSION,
95 CHARS, and FINAL-CHAR) to Emacs' charset. */
96 int iso_charset_table[2][2][128];
97
98 /* Table of pointers to the structure `cmpchar_info' indexed by
99 CMPCHAR-ID. */
100 struct cmpchar_info **cmpchar_table;
101 /* The current size of `cmpchar_table'. */
102 static int cmpchar_table_size;
103 /* Number of the current composite characters. */
104 int n_cmpchars;
105
106 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
107 unsigned char *_fetch_multibyte_char_p;
108 int _fetch_multibyte_char_len;
109
110 /* Offset to add to a non-ASCII value when inserting it. */
111 int nonascii_insert_offset;
112
113 /* Translation table for converting non-ASCII unibyte characters
114 to multibyte codes, or nil. */
115 Lisp_Object Vnonascii_translation_table;
116
117 /* List of all possible generic characters. */
118 Lisp_Object Vgeneric_character_list;
119
120 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
121 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
122 \f
123 void
124 invalid_character (c)
125 int c;
126 {
127 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
128 }
129
130 /* Parse composite character string STR of length LENGTH (>= 2) and
131 set BYTES, CHARSET, C1, and C2 as below.
132
133 It is assumed that *STR is LEADING_CODE_COMPOSITION and the
134 following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
135
136 If there is a valid composite character, set CHARSET, C1, and C2 to
137 such values that MAKE_CHAR can make the composite character from
138 them. Otherwise, set CHARSET to CHARSET_COMPOSITION, set C1 to the
139 second byte of the sequence, C2 to -1 so that MAKE_CHAR can make
140 the invalid multibyte character whose string representation is two
141 bytes of STR[0] and STR[1]. In any case, set BYTES to LENGTH.
142
143 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
144
145 #define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2) \
146 do { \
147 int cmpchar_id = str_cmpchar_id ((str), (length)); \
148 \
149 (charset) = CHARSET_COMPOSITION; \
150 (bytes) = (length); \
151 if (cmpchar_id >= 0) \
152 { \
153 (c1) = CHAR_FIELD2 (cmpchar_id); \
154 (c2) = CHAR_FIELD3 (cmpchar_id); \
155 } \
156 else \
157 { \
158 (c1) = (str)[1] & 0x7F; \
159 (c2) = -1; \
160 } \
161 } while (0)
162
163 /* Parse non-composite multibyte character string STR of length LENGTH
164 (>= 2) and set BYTES to the length of actual multibyte sequence,
165 CHARSET, C1, and C2 to such values that MAKE_CHAR can make the
166 multibyte character from them.
167
168 It is assumed that *STR is one of base leading codes (excluding
169 LEADING_CODE_COMPOSITION) and the following (LENGTH - 1) bytes
170 satisfy !CHAR_HEAD_P.
171
172 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
173
174 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \
175 do { \
176 (bytes) = 1; \
177 (charset) = (str)[0]; \
178 if ((charset) >= LEADING_CODE_PRIVATE_11 \
179 && (charset) <= LEADING_CODE_PRIVATE_22) \
180 (charset) = (str)[(bytes)++]; \
181 if ((bytes) < (length)) \
182 { \
183 (c1) = (str)[(bytes)++] & 0x7F; \
184 if ((bytes) < (length)) \
185 (c2) = (str)[(bytes)++] & 0x7F; \
186 else \
187 (c2) = -1; \
188 } \
189 else \
190 (c1) = (c2) = -1; \
191 } while (0)
192
193 /* Parse string STR of length LENGTH and check if a multibyte
194 characters is at STR. set BYTES to the actual length, CHARSET, C1,
195 C2 to proper values for that character. */
196
197 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
198 do { \
199 int i; \
200 if (ASCII_BYTE_P ((str)[0])) \
201 i = 1; \
202 else \
203 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
204 if (i == 1) \
205 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
206 else if ((str)[0] == LEADING_CODE_COMPOSITION) \
207 SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2); \
208 else \
209 { \
210 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
211 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
212 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
213 } \
214 } while (0)
215
216 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
217 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
218 (charset == CHARSET_ASCII \
219 ? ((c1) >= 0 && (c1) <= 0x7F) \
220 : (CHARSET_DIMENSION (charset) == 1 \
221 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
222 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F)))
223
224 /* Set STR a pointer to the multi-byte form of the character C. If C
225 is not a composite character, the multi-byte form is set in WORKBUF
226 and STR points WORKBUF. The caller should allocate at least 4-byte
227 area at WORKBUF in advance. Returns the length of the multi-byte
228 form. If C is an invalid character, store (C & 0xFF) in WORKBUF[0]
229 and return 1.
230
231 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
232 function directly if C can be an ASCII character. */
233
234 int
235 non_ascii_char_to_string (c, workbuf, str)
236 int c;
237 unsigned char *workbuf, **str;
238 {
239 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
240 {
241 /* Multibyte character can't have a modifier bit. */
242 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
243 invalid_character (c);
244
245 /* For Meta, Shift, and Control modifiers, we need special care. */
246 if (c & CHAR_META)
247 {
248 /* Move the meta bit to the right place for a string. */
249 c = (c & ~CHAR_META) | 0x80;
250 }
251 if (c & CHAR_SHIFT)
252 {
253 /* Shift modifier is valid only with [A-Za-z]. */
254 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
255 c &= ~CHAR_SHIFT;
256 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
257 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
258 }
259 if (c & CHAR_CTL)
260 {
261 /* Simulate the code in lread.c. */
262 /* Allow `\C- ' and `\C-?'. */
263 if (c == (CHAR_CTL | ' '))
264 c = 0;
265 else if (c == (CHAR_CTL | '?'))
266 c = 127;
267 /* ASCII control chars are made from letters (both cases),
268 as well as the non-letters within 0100...0137. */
269 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
270 c &= (037 | (~0177 & ~CHAR_CTL));
271 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
272 c &= (037 | (~0177 & ~CHAR_CTL));
273 }
274
275 /* If C still has any modifier bits, it is an invalid character. */
276 if (c & CHAR_MODIFIER_MASK)
277 invalid_character (c);
278
279 *str = workbuf;
280 *workbuf++ = c;
281 }
282 else
283 {
284 int charset, c1, c2;
285
286 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
287 if (charset == CHARSET_COMPOSITION)
288 {
289 if (c >= MAX_CHAR)
290 invalid_character (c);
291 if (c >= MIN_CHAR_COMPOSITION)
292 {
293 /* Valid composite character. */
294 *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data;
295 workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len;
296 }
297 else
298 {
299 /* Invalid but can have multibyte form. */
300 *str = workbuf;
301 *workbuf++ = LEADING_CODE_COMPOSITION;
302 *workbuf++ = c1 | 0x80;
303 }
304 }
305 else if (charset > CHARSET_COMPOSITION)
306 {
307 *str = workbuf;
308 if (charset >= LEADING_CODE_EXT_11)
309 *workbuf++ = (charset < LEADING_CODE_EXT_12
310 ? LEADING_CODE_PRIVATE_11
311 : (charset < LEADING_CODE_EXT_21
312 ? LEADING_CODE_PRIVATE_12
313 : (charset < LEADING_CODE_EXT_22
314 ? LEADING_CODE_PRIVATE_21
315 : LEADING_CODE_PRIVATE_22)));
316 *workbuf++ = charset;
317 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
318 invalid_character (c);
319 if (c1)
320 {
321 *workbuf++ = c1 | 0x80;
322 if (c2 > 0)
323 *workbuf++ = c2 | 0x80;
324 }
325 }
326 else if (charset == CHARSET_ASCII)
327 *workbuf++= c & 0x7F;
328 else
329 invalid_character (c);
330 }
331
332 return (workbuf - *str);
333 }
334
335 /* Return the non-ASCII character corresponding to multi-byte form at
336 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
337 length of the multibyte form in *ACTUAL_LEN.
338
339 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
340 directly if you want ot handle ASCII characters as well. */
341
342 int
343 string_to_non_ascii_char (str, len, actual_len)
344 const unsigned char *str;
345 int len, *actual_len;
346 {
347 int c, bytes, charset, c1, c2;
348
349 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
350 c = MAKE_CHAR (charset, c1, c2);
351 if (actual_len)
352 *actual_len = bytes;
353 return c;
354 }
355
356 /* Return the length of the multi-byte form at string STR of length LEN.
357 Use the macro MULTIBYTE_FORM_LENGTH instead. */
358 int
359 multibyte_form_length (str, len)
360 const unsigned char *str;
361 int len;
362 {
363 int bytes;
364
365 PARSE_MULTIBYTE_SEQ (str, len, bytes);
366 return bytes;
367 }
368
369 /* Check multibyte form at string STR of length LEN and set variables
370 pointed by CHARSET, C1, and C2 to charset and position codes of the
371 character at STR, and return 0. If there's no multibyte character,
372 return -1. This should be used only in the macro SPLIT_STRING
373 which checks range of STR in advance. */
374
375 int
376 split_non_ascii_string (str, len, charset, c1, c2)
377 const unsigned char *str;
378 unsigned char *c1, *c2;
379 int len, *charset;
380 {
381 register int bytes, cs, code1, code2 = -1;
382
383 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
384 if (cs == CHARSET_ASCII)
385 return -1;
386 *charset = cs;
387 *c1 = code1;
388 *c2 = code2;
389 return 0;
390 }
391
392 /* Return 1 iff character C has valid printable glyph.
393 Use the macro CHAR_PRINTABLE_P instead. */
394 int
395 char_printable_p (c)
396 int c;
397 {
398 int charset, c1, c2, chars;
399
400 if (SINGLE_BYTE_CHAR_P (c))
401 return 1;
402 if (c >= MIN_CHAR_COMPOSITION)
403 return (c < MAX_CHAR);
404
405 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
406 if (! CHARSET_DEFINED_P (charset))
407 return 0;
408 if (CHARSET_CHARS (charset) == 94
409 ? c1 <= 32 || c1 >= 127
410 : c1 < 32)
411 return 0;
412 if (CHARSET_DIMENSION (charset) == 2
413 && (CHARSET_CHARS (charset) == 94
414 ? c2 <= 32 || c2 >= 127
415 : c2 < 32))
416 return 0;
417 return 1;
418 }
419
420 /* Translate character C by translation table TABLE. If C
421 is negative, translate a character specified by CHARSET, C1, and C2
422 (C1 and C2 are code points of the character). If no translation is
423 found in TABLE, return C. */
424 int
425 translate_char (table, c, charset, c1, c2)
426 Lisp_Object table;
427 int c, charset, c1, c2;
428 {
429 Lisp_Object ch;
430 int alt_charset, alt_c1, alt_c2, dimension;
431
432 if (c < 0) c = MAKE_CHAR (charset, (c1 & 0x7F) , (c2 & 0x7F));
433 if (!CHAR_TABLE_P (table)
434 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
435 return c;
436
437 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
438 dimension = CHARSET_DIMENSION (alt_charset);
439 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
440 /* CH is not a generic character, just return it. */
441 return XFASTINT (ch);
442
443 /* Since CH is a generic character, we must return a specific
444 charater which has the same position codes as C from CH. */
445 if (charset < 0)
446 SPLIT_CHAR (c, charset, c1, c2);
447 if (dimension != CHARSET_DIMENSION (charset))
448 /* We can't make such a character because of dimension mismatch. */
449 return c;
450 return MAKE_CHAR (alt_charset, c1, c2);
451 }
452
453 /* Convert the unibyte character C to multibyte based on
454 Vnonascii_translation_table or nonascii_insert_offset. If they can't
455 convert C to a valid multibyte character, convert it based on
456 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
457
458 int
459 unibyte_char_to_multibyte (c)
460 int c;
461 {
462 if (c < 0400 && c >= 0200)
463 {
464 int c_save = c;
465
466 if (! NILP (Vnonascii_translation_table))
467 {
468 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
469 if (c >= 0400 && ! char_valid_p (c, 0))
470 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
471 }
472 else if (c >= 0240 && nonascii_insert_offset > 0)
473 {
474 c += nonascii_insert_offset;
475 if (c < 0400 || ! char_valid_p (c, 0))
476 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
477 }
478 else if (c >= 0240)
479 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
480 }
481 return c;
482 }
483
484
485 /* Convert the multibyte character C to unibyte 8-bit character based
486 on Vnonascii_translation_table or nonascii_insert_offset. If
487 REV_TBL is non-nil, it should be a reverse table of
488 Vnonascii_translation_table, i.e. what given by:
489 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
490
491 int
492 multibyte_char_to_unibyte (c, rev_tbl)
493 int c;
494 Lisp_Object rev_tbl;
495 {
496 if (!SINGLE_BYTE_CHAR_P (c))
497 {
498 int c_save = c;
499
500 if (! CHAR_TABLE_P (rev_tbl)
501 && CHAR_TABLE_P (Vnonascii_translation_table))
502 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
503 make_number (0));
504 if (CHAR_TABLE_P (rev_tbl))
505 {
506 Lisp_Object temp;
507 temp = Faref (rev_tbl, make_number (c));
508 if (INTEGERP (temp))
509 c = XINT (temp);
510 if (c >= 256)
511 c = (c_save & 0177) + 0200;
512 }
513 else
514 {
515 if (nonascii_insert_offset > 0)
516 c -= nonascii_insert_offset;
517 if (c < 128 || c >= 256)
518 c = (c_save & 0177) + 0200;
519 }
520 }
521
522 return c;
523 }
524
525 \f
526 /* Update the table Vcharset_table with the given arguments (see the
527 document of `define-charset' for the meaning of each argument).
528 Several other table contents are also updated. The caller should
529 check the validity of CHARSET-ID and the remaining arguments in
530 advance. */
531
532 void
533 update_charset_table (charset_id, dimension, chars, width, direction,
534 iso_final_char, iso_graphic_plane,
535 short_name, long_name, description)
536 Lisp_Object charset_id, dimension, chars, width, direction;
537 Lisp_Object iso_final_char, iso_graphic_plane;
538 Lisp_Object short_name, long_name, description;
539 {
540 int charset = XINT (charset_id);
541 int bytes;
542 unsigned char leading_code_base, leading_code_ext;
543
544 if (NILP (CHARSET_TABLE_ENTRY (charset)))
545 CHARSET_TABLE_ENTRY (charset)
546 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
547
548 /* Get byte length of multibyte form, base leading-code, and
549 extended leading-code of the charset. See the comment under the
550 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
551 bytes = XINT (dimension);
552 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
553 {
554 /* Official charset, it doesn't have an extended leading-code. */
555 if (charset != CHARSET_ASCII)
556 bytes += 1; /* For a base leading-code. */
557 leading_code_base = charset;
558 leading_code_ext = 0;
559 }
560 else
561 {
562 /* Private charset. */
563 bytes += 2; /* For base and extended leading-codes. */
564 leading_code_base
565 = (charset < LEADING_CODE_EXT_12
566 ? LEADING_CODE_PRIVATE_11
567 : (charset < LEADING_CODE_EXT_21
568 ? LEADING_CODE_PRIVATE_12
569 : (charset < LEADING_CODE_EXT_22
570 ? LEADING_CODE_PRIVATE_21
571 : LEADING_CODE_PRIVATE_22)));
572 leading_code_ext = charset;
573 }
574
575 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
576 error ("Invalid dimension for the charset-ID %d", charset);
577
578 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
579 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
580 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
581 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
582 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
583 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
584 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
585 = make_number (leading_code_base);
586 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
587 = make_number (leading_code_ext);
588 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
589 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
590 = iso_graphic_plane;
591 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
592 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
593 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
594 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
595
596 {
597 /* If we have already defined a charset which has the same
598 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
599 DIRECTION, we must update the entry REVERSE-CHARSET of both
600 charsets. If there's no such charset, the value of the entry
601 is set to nil. */
602 int i;
603
604 for (i = 0; i <= MAX_CHARSET; i++)
605 if (!NILP (CHARSET_TABLE_ENTRY (i)))
606 {
607 if (CHARSET_DIMENSION (i) == XINT (dimension)
608 && CHARSET_CHARS (i) == XINT (chars)
609 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
610 && CHARSET_DIRECTION (i) != XINT (direction))
611 {
612 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
613 = make_number (i);
614 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
615 break;
616 }
617 }
618 if (i > MAX_CHARSET)
619 /* No such a charset. */
620 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
621 = make_number (-1);
622 }
623
624 if (charset != CHARSET_ASCII
625 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
626 {
627 width_by_char_head[leading_code_base] = XINT (width);
628
629 /* Update table emacs_code_class. */
630 emacs_code_class[charset] = (bytes == 2
631 ? EMACS_leading_code_2
632 : (bytes == 3
633 ? EMACS_leading_code_3
634 : EMACS_leading_code_4));
635 }
636
637 /* Update table iso_charset_table. */
638 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
639 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
640 }
641
642 #ifdef emacs
643
644 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
645 is invalid. */
646 int
647 get_charset_id (charset_symbol)
648 Lisp_Object charset_symbol;
649 {
650 Lisp_Object val;
651 int charset;
652
653 return ((SYMBOLP (charset_symbol)
654 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
655 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
656 CHARSET_VALID_P (charset)))
657 ? charset : -1);
658 }
659
660 /* Return an identification number for a new private charset of
661 DIMENSION and WIDTH. If there's no more room for the new charset,
662 return 0. */
663 Lisp_Object
664 get_new_private_charset_id (dimension, width)
665 int dimension, width;
666 {
667 int charset, from, to;
668
669 if (dimension == 1)
670 {
671 if (width == 1)
672 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
673 else
674 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
675 }
676 else
677 {
678 if (width == 1)
679 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
680 else
681 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
682 }
683
684 for (charset = from; charset < to; charset++)
685 if (!CHARSET_DEFINED_P (charset)) break;
686
687 return make_number (charset < to ? charset : 0);
688 }
689
690 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
691 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
692 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
693 treated as a private charset.\n\
694 INFO-VECTOR is a vector of the format:\n\
695 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
696 SHORT-NAME LONG-NAME DESCRIPTION]\n\
697 The meanings of each elements is as follows:\n\
698 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
699 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
700 WIDTH (integer) is the number of columns a character in the charset\n\
701 occupies on the screen: one of 0, 1, and 2.\n\
702 \n\
703 DIRECTION (integer) is the rendering direction of characters in the\n\
704 charset when rendering. If 0, render from left to right, else\n\
705 render from right to left.\n\
706 \n\
707 ISO-FINAL-CHAR (character) is the final character of the\n\
708 corresponding ISO 2022 charset.\n\
709 \n\
710 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
711 while encoding to variants of ISO 2022 coding system, one of the\n\
712 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
713 \n\
714 SHORT-NAME (string) is the short name to refer to the charset.\n\
715 \n\
716 LONG-NAME (string) is the long name to refer to the charset.\n\
717 \n\
718 DESCRIPTION (string) is the description string of the charset.")
719 (charset_id, charset_symbol, info_vector)
720 Lisp_Object charset_id, charset_symbol, info_vector;
721 {
722 Lisp_Object *vec;
723
724 if (!NILP (charset_id))
725 CHECK_NUMBER (charset_id, 0);
726 CHECK_SYMBOL (charset_symbol, 1);
727 CHECK_VECTOR (info_vector, 2);
728
729 if (! NILP (charset_id))
730 {
731 if (! CHARSET_VALID_P (XINT (charset_id)))
732 error ("Invalid CHARSET: %d", XINT (charset_id));
733 else if (CHARSET_DEFINED_P (XINT (charset_id)))
734 error ("Already defined charset: %d", XINT (charset_id));
735 }
736
737 vec = XVECTOR (info_vector)->contents;
738 if (XVECTOR (info_vector)->size != 9
739 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
740 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
741 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
742 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
743 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
744 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
745 || !STRINGP (vec[6])
746 || !STRINGP (vec[7])
747 || !STRINGP (vec[8]))
748 error ("Invalid info-vector argument for defining charset %s",
749 XSYMBOL (charset_symbol)->name->data);
750
751 if (NILP (charset_id))
752 {
753 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
754 if (XINT (charset_id) == 0)
755 error ("There's no room for a new private charset %s",
756 XSYMBOL (charset_symbol)->name->data);
757 }
758
759 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
760 vec[4], vec[5], vec[6], vec[7], vec[8]);
761 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
762 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
763 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
764 return Qnil;
765 }
766
767 DEFUN ("generic-character-list", Fgeneric_character_list,
768 Sgeneric_character_list, 0, 0, 0,
769 "Return a list of all possible generic characters.\n\
770 It includes a generic character for a charset not yet defined.")
771 ()
772 {
773 return Vgeneric_character_list;
774 }
775
776 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
777 Sget_unused_iso_final_char, 2, 2, 0,
778 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
779 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
780 CHARS is the number of characters in a dimension: 94 or 96.\n\
781 \n\
782 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
783 If there's no unused final char for the specified kind of charset,\n\
784 return nil.")
785 (dimension, chars)
786 Lisp_Object dimension, chars;
787 {
788 int final_char;
789
790 CHECK_NUMBER (dimension, 0);
791 CHECK_NUMBER (chars, 1);
792 if (XINT (dimension) != 1 && XINT (dimension) != 2)
793 error ("Invalid charset dimension %d, it should be 1 or 2",
794 XINT (dimension));
795 if (XINT (chars) != 94 && XINT (chars) != 96)
796 error ("Invalid charset chars %d, it should be 94 or 96",
797 XINT (chars));
798 for (final_char = '0'; final_char <= '?'; final_char++)
799 {
800 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
801 break;
802 }
803 return (final_char <= '?' ? make_number (final_char) : Qnil);
804 }
805
806 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
807 4, 4, 0,
808 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
809 CHARSET should be defined by `defined-charset' in advance.")
810 (dimension, chars, final_char, charset_symbol)
811 Lisp_Object dimension, chars, final_char, charset_symbol;
812 {
813 int charset;
814
815 CHECK_NUMBER (dimension, 0);
816 CHECK_NUMBER (chars, 1);
817 CHECK_NUMBER (final_char, 2);
818 CHECK_SYMBOL (charset_symbol, 3);
819
820 if (XINT (dimension) != 1 && XINT (dimension) != 2)
821 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
822 if (XINT (chars) != 94 && XINT (chars) != 96)
823 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
824 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
825 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
826 if ((charset = get_charset_id (charset_symbol)) < 0)
827 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
828
829 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
830 return Qnil;
831 }
832
833 /* Return number of different charsets in STR of length LEN. In
834 addition, for each found charset N, CHARSETS[N] is set 1. The
835 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
836 It may lookup a translation table TABLE if supplied.
837
838 If CMPCHARP is nonzero and some composite character is found,
839 CHARSETS[128] is also set 1 and the returned number is incremented
840 by 1.
841
842 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
843 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
844 8-bit codes are found CHARSET[1] is set to 1. */
845
846 int
847 find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
848 unsigned char *str;
849 int len, *charsets;
850 Lisp_Object table;
851 int cmpcharp;
852 int multibyte;
853 {
854 register int num = 0, c;
855
856 if (! multibyte)
857 {
858 unsigned char *endp = str + len;
859 int maskbits = 0;
860
861 while (str < endp && maskbits != 3)
862 maskbits |= (*str++ < 0x80 ? 1 : 2);
863 if (maskbits & 1)
864 {
865 charsets[0] = 1;
866 num++;
867 }
868 if (maskbits & 2)
869 {
870 charsets[1] = 1;
871 num++;
872 }
873 return num;
874 }
875
876 if (! CHAR_TABLE_P (table))
877 table = Qnil;
878
879 while (len > 0)
880 {
881 int bytes, charset;
882 c = *str;
883
884 if (c == LEADING_CODE_COMPOSITION)
885 {
886 int cmpchar_id = str_cmpchar_id (str, len);
887 GLYPH *glyph;
888
889 if (cmpchar_id >= 0)
890 {
891 struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
892 int i;
893
894 for (i = 0; i < cmp_p->glyph_len; i++)
895 {
896 c = cmp_p->glyph[i];
897 if (!NILP (table))
898 {
899 if ((c = translate_char (table, c, 0, 0, 0)) < 0)
900 c = cmp_p->glyph[i];
901 }
902 if ((charset = CHAR_CHARSET (c)) < 0)
903 charset = CHARSET_ASCII;
904 if (!charsets[charset])
905 {
906 charsets[charset] = 1;
907 num += 1;
908 }
909 }
910 str += cmp_p->len;
911 len -= cmp_p->len;
912 if (cmpcharp && !charsets[CHARSET_COMPOSITION])
913 {
914 charsets[CHARSET_COMPOSITION] = 1;
915 num += 1;
916 }
917 continue;
918 }
919
920 charset = 1; /* This leads to `unknown' charset. */
921 bytes = 1;
922 }
923 else
924 {
925 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
926 if (! NILP (table))
927 {
928 int c1 = translate_char (table, c, 0, 0, 0);
929 if (c1 >= 0)
930 c = c1;
931 }
932 charset = CHAR_CHARSET (c);
933 }
934
935 if (!charsets[charset])
936 {
937 charsets[charset] = 1;
938 num += 1;
939 }
940 str += bytes;
941 len -= bytes;
942 }
943 return num;
944 }
945
946 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
947 2, 3, 0,
948 "Return a list of charsets in the region between BEG and END.\n\
949 BEG and END are buffer positions.\n\
950 If the region contains any composite character,\n\
951 `composition' is included in the returned list.\n\
952 Optional arg TABLE if non-nil is a translation table to look up.\n\
953 \n\
954 If the region contains invalid multiybte characters,\n\
955 `unknown' is included in the returned list.\n\
956 \n\
957 If the current buffer is unibyte, the returned list contains\n\
958 `ascii' if any 7-bit characters are found,\n\
959 and `unknown' if any 8-bit characters are found.")
960 (beg, end, table)
961 Lisp_Object beg, end, table;
962 {
963 int charsets[MAX_CHARSET + 1];
964 int from, from_byte, to, stop, stop_byte, i;
965 Lisp_Object val;
966 int undefined;
967 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
968
969 validate_region (&beg, &end);
970 from = XFASTINT (beg);
971 stop = to = XFASTINT (end);
972
973 if (from < GPT && GPT < to)
974 {
975 stop = GPT;
976 stop_byte = GPT_BYTE;
977 }
978 else
979 stop_byte = CHAR_TO_BYTE (stop);
980
981 from_byte = CHAR_TO_BYTE (from);
982
983 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
984 while (1)
985 {
986 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
987 charsets, table, 1, multibyte);
988 if (stop < to)
989 {
990 from = stop, from_byte = stop_byte;
991 stop = to, stop_byte = CHAR_TO_BYTE (stop);
992 }
993 else
994 break;
995 }
996
997 val = Qnil;
998 undefined = 0;
999 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
1000 if (charsets[i])
1001 {
1002 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1003 val = Fcons (CHARSET_SYMBOL (i), val);
1004 else
1005 undefined = 1;
1006 }
1007 if (undefined)
1008 val = Fcons (Qunknown, val);
1009 return val;
1010 }
1011
1012 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1013 1, 2, 0,
1014 "Return a list of charsets in STR.\n\
1015 If the string contains any composite characters,\n\
1016 `composition' is included in the returned list.\n\
1017 Optional arg TABLE if non-nil is a translation table to look up.\n\
1018 \n\
1019 If the region contains invalid multiybte characters,\n\
1020 `unknown' is included in the returned list.\n\
1021 \n\
1022 If STR is unibyte, the returned list contains\n\
1023 `ascii' if any 7-bit characters are found,\n\
1024 and `unknown' if any 8-bit characters are found.")
1025 (str, table)
1026 Lisp_Object str, table;
1027 {
1028 int charsets[MAX_CHARSET + 1];
1029 int i;
1030 Lisp_Object val;
1031 int undefined;
1032 int multibyte;
1033
1034 CHECK_STRING (str, 0);
1035 multibyte = STRING_MULTIBYTE (str);
1036
1037 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
1038 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
1039 charsets, table, 1, multibyte);
1040 val = Qnil;
1041 undefined = 0;
1042 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
1043 if (charsets[i])
1044 {
1045 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1046 val = Fcons (CHARSET_SYMBOL (i), val);
1047 else
1048 undefined = 1;
1049 }
1050 if (undefined)
1051 val = Fcons (Qunknown, val);
1052 return val;
1053 }
1054 \f
1055 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
1056 "")
1057 (charset, code1, code2)
1058 Lisp_Object charset, code1, code2;
1059 {
1060 int charset_id, c1, c2;
1061
1062 CHECK_NUMBER (charset, 0);
1063 charset_id = XINT (charset);
1064 if (!CHARSET_DEFINED_P (charset_id))
1065 error ("Invalid charset ID: %d", XINT (charset));
1066
1067 if (NILP (code1))
1068 c1 = 0;
1069 else
1070 {
1071 CHECK_NUMBER (code1, 1);
1072 c1 = XINT (code1);
1073 }
1074 if (NILP (code2))
1075 c2 = 0;
1076 else
1077 {
1078 CHECK_NUMBER (code2, 2);
1079 c2 = XINT (code2);
1080 }
1081
1082 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1083 error ("Invalid code points: %d %d", c1, c2);
1084 c1 &= 0x7F;
1085 c2 &= 0x7F;
1086 if (c1 == 0
1087 ? c2 != 0
1088 : (c2 == 0
1089 ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
1090 : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
1091 error ("Invalid code points: %d %d", c1, c2);
1092
1093 return make_number (MAKE_CHAR (charset_id, c1, c2));
1094 }
1095
1096 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1097 "Return list of charset and one or two position-codes of CHAR.\n\
1098 If CHAR is invalid as a character code,\n\
1099 return a list of symbol `unknown' and CHAR.")
1100 (ch)
1101 Lisp_Object ch;
1102 {
1103 Lisp_Object val;
1104 int c, charset, c1, c2;
1105
1106 CHECK_NUMBER (ch, 0);
1107 c = XFASTINT (ch);
1108 if (!CHAR_VALID_P (c, 1))
1109 return Fcons (Qunknown, Fcons (ch, Qnil));
1110 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1111 return (c2 >= 0
1112 ? Fcons (CHARSET_SYMBOL (charset),
1113 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1114 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1115 }
1116
1117 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1118 "Return charset of CHAR.")
1119 (ch)
1120 Lisp_Object ch;
1121 {
1122 CHECK_NUMBER (ch, 0);
1123
1124 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1125 }
1126
1127 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1128 "Return charset of a character in the current buffer at position POS.\n\
1129 If POS is nil, it defauls to the current point.\n\
1130 If POS is out of range, the value is nil.")
1131 (pos)
1132 Lisp_Object pos;
1133 {
1134 register int pos_byte, bytes, charset, c1, c2;
1135 register unsigned char *p;
1136
1137 if (NILP (pos))
1138 pos_byte = PT_BYTE;
1139 else if (MARKERP (pos))
1140 {
1141 pos_byte = marker_byte_position (pos);
1142 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1143 return Qnil;
1144 }
1145 else
1146 {
1147 CHECK_NUMBER (pos, 0);
1148 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1149 return Qnil;
1150 pos_byte = CHAR_TO_BYTE (XINT (pos));
1151 }
1152 p = BYTE_POS_ADDR (pos_byte);
1153 if (BASE_LEADING_CODE_P (*p))
1154 {
1155 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1156 if (charset < 0)
1157 charset = 1;
1158 }
1159 else
1160 charset = CHARSET_ASCII;
1161
1162 return CHARSET_SYMBOL (charset);
1163 }
1164
1165 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1166 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1167 \n\
1168 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1169 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1170 where as Emacs distinguishes them by charset symbol.\n\
1171 See the documentation of the function `charset-info' for the meanings of\n\
1172 DIMENSION, CHARS, and FINAL-CHAR.")
1173 (dimension, chars, final_char)
1174 Lisp_Object dimension, chars, final_char;
1175 {
1176 int charset;
1177
1178 CHECK_NUMBER (dimension, 0);
1179 CHECK_NUMBER (chars, 1);
1180 CHECK_NUMBER (final_char, 2);
1181
1182 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1183 return Qnil;
1184 return CHARSET_SYMBOL (charset);
1185 }
1186
1187 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1188 generic character. If GENERICP is zero, return nonzero iff C is a
1189 valid normal character. Do not call this function directly,
1190 instead use macro CHAR_VALID_P. */
1191 int
1192 char_valid_p (c, genericp)
1193 int c, genericp;
1194 {
1195 int charset, c1, c2;
1196
1197 if (c < 0)
1198 return 0;
1199 if (SINGLE_BYTE_CHAR_P (c))
1200 return 1;
1201 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1202 if (charset == CHARSET_COMPOSITION)
1203 return ((c >= MIN_CHAR_COMPOSITION
1204 && c < MIN_CHAR_COMPOSITION + n_cmpchars)
1205 || (genericp && c == GENERIC_COMPOSITION_CHAR));
1206 if (genericp)
1207 {
1208 if (c1)
1209 {
1210 if (c2 <= 0) c2 = 0x20;
1211 }
1212 else
1213 {
1214 if (c2 <= 0) c1 = c2 = 0x20;
1215 }
1216 }
1217 return (CHARSET_DEFINED_P (charset)
1218 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1219 }
1220
1221 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1222 "Return t if OBJECT is a valid normal character.\n\
1223 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1224 a valid generic character.")
1225 (object, genericp)
1226 Lisp_Object object, genericp;
1227 {
1228 if (! NATNUMP (object))
1229 return Qnil;
1230 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1231 }
1232
1233 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1234 Sunibyte_char_to_multibyte, 1, 1, 0,
1235 "Convert the unibyte character CH to multibyte character.\n\
1236 The conversion is done based on `nonascii-translation-table' (which see)\n\
1237 or `nonascii-insert-offset' (which see).")
1238 (ch)
1239 Lisp_Object ch;
1240 {
1241 int c;
1242
1243 CHECK_NUMBER (ch, 0);
1244 c = XINT (ch);
1245 if (c < 0 || c >= 0400)
1246 error ("Invalid unibyte character: %d", c);
1247 c = unibyte_char_to_multibyte (c);
1248 if (c < 0)
1249 error ("Can't convert to multibyte character: %d", XINT (ch));
1250 return make_number (c);
1251 }
1252
1253 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1254 Smultibyte_char_to_unibyte, 1, 1, 0,
1255 "Convert the multibyte character CH to unibyte character.\n\
1256 The conversion is done based on `nonascii-translation-table' (which see)\n\
1257 or `nonascii-insert-offset' (which see).")
1258 (ch)
1259 Lisp_Object ch;
1260 {
1261 int c;
1262
1263 CHECK_NUMBER (ch, 0);
1264 c = XINT (ch);
1265 if (! CHAR_VALID_P (c, 0))
1266 error ("Invalid multibyte character: %d", c);
1267 c = multibyte_char_to_unibyte (c, Qnil);
1268 if (c < 0)
1269 error ("Can't convert to unibyte character: %d", XINT (ch));
1270 return make_number (c);
1271 }
1272
1273 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1274 "Return 1 regardless of the argument CHAR.\n\
1275 This is now an obsolete function. We keep it just for backward compatibility.")
1276 (ch)
1277 Lisp_Object ch;
1278 {
1279 Lisp_Object val;
1280
1281 CHECK_NUMBER (ch, 0);
1282 return make_number (1);
1283 }
1284
1285 /* Return how many bytes C will occupy in a multibyte buffer.
1286 Don't call this function directly, instead use macro CHAR_BYTES. */
1287 int
1288 char_bytes (c)
1289 int c;
1290 {
1291 int bytes;
1292
1293 if (SINGLE_BYTE_CHAR_P (c) || (c & ~GLYPH_MASK_CHAR))
1294 return 1;
1295
1296 if (COMPOSITE_CHAR_P (c))
1297 {
1298 unsigned int id = COMPOSITE_CHAR_ID (c);
1299
1300 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
1301 }
1302 else
1303 {
1304 int charset = CHAR_CHARSET (c);
1305
1306 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
1307 }
1308
1309 return bytes;
1310 }
1311
1312 /* Return the width of character of which multi-byte form starts with
1313 C. The width is measured by how many columns occupied on the
1314 screen when displayed in the current buffer. */
1315
1316 #define ONE_BYTE_CHAR_WIDTH(c) \
1317 (c < 0x20 \
1318 ? (c == '\t' \
1319 ? XFASTINT (current_buffer->tab_width) \
1320 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1321 : (c < 0x7f \
1322 ? 1 \
1323 : (c == 0x7F \
1324 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1325 : ((! NILP (current_buffer->enable_multibyte_characters) \
1326 && BASE_LEADING_CODE_P (c)) \
1327 ? WIDTH_BY_CHAR_HEAD (c) \
1328 : 4))))
1329
1330 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1331 "Return width of CHAR when displayed in the current buffer.\n\
1332 The width is measured by how many columns it occupies on the screen.")
1333 (ch)
1334 Lisp_Object ch;
1335 {
1336 Lisp_Object val, disp;
1337 int c;
1338 struct Lisp_Char_Table *dp = buffer_display_table ();
1339
1340 CHECK_NUMBER (ch, 0);
1341
1342 c = XINT (ch);
1343
1344 /* Get the way the display table would display it. */
1345 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1346
1347 if (VECTORP (disp))
1348 XSETINT (val, XVECTOR (disp)->size);
1349 else if (SINGLE_BYTE_CHAR_P (c))
1350 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1351 else if (COMPOSITE_CHAR_P (c))
1352 {
1353 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
1354 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
1355 }
1356 else
1357 {
1358 int charset = CHAR_CHARSET (c);
1359
1360 XSETFASTINT (val, CHARSET_WIDTH (charset));
1361 }
1362 return val;
1363 }
1364
1365 /* Return width of string STR of length LEN when displayed in the
1366 current buffer. The width is measured by how many columns it
1367 occupies on the screen. */
1368
1369 int
1370 strwidth (str, len)
1371 unsigned char *str;
1372 int len;
1373 {
1374 unsigned char *endp = str + len;
1375 int width = 0;
1376 struct Lisp_Char_Table *dp = buffer_display_table ();
1377
1378 while (str < endp)
1379 {
1380 if (*str == LEADING_CODE_COMPOSITION)
1381 {
1382 int id = str_cmpchar_id (str, endp - str);
1383
1384 if (id < 0)
1385 {
1386 width += 4;
1387 str++;
1388 }
1389 else
1390 {
1391 width += cmpchar_table[id]->width;
1392 str += cmpchar_table[id]->len;
1393 }
1394 }
1395 else
1396 {
1397 Lisp_Object disp;
1398 int thislen;
1399 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1400
1401 /* Get the way the display table would display it. */
1402 if (dp)
1403 disp = DISP_CHAR_VECTOR (dp, c);
1404 else
1405 disp = Qnil;
1406
1407 if (VECTORP (disp))
1408 width += XVECTOR (disp)->size;
1409 else
1410 width += ONE_BYTE_CHAR_WIDTH (*str);
1411
1412 str += thislen;
1413 }
1414 }
1415 return width;
1416 }
1417
1418 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1419 "Return width of STRING when displayed in the current buffer.\n\
1420 Width is measured by how many columns it occupies on the screen.\n\
1421 When calculating width of a multibyte character in STRING,\n\
1422 only the base leading-code is considered; the validity of\n\
1423 the following bytes is not checked.")
1424 (str)
1425 Lisp_Object str;
1426 {
1427 Lisp_Object val;
1428
1429 CHECK_STRING (str, 0);
1430 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1431 STRING_BYTES (XSTRING (str))));
1432 return val;
1433 }
1434
1435 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1436 "Return the direction of CHAR.\n\
1437 The returned value is 0 for left-to-right and 1 for right-to-left.")
1438 (ch)
1439 Lisp_Object ch;
1440 {
1441 int charset;
1442
1443 CHECK_NUMBER (ch, 0);
1444 charset = CHAR_CHARSET (XFASTINT (ch));
1445 if (!CHARSET_DEFINED_P (charset))
1446 invalid_character (XINT (ch));
1447 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1448 }
1449
1450 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1451 "Return number of characters between BEG and END.")
1452 (beg, end)
1453 Lisp_Object beg, end;
1454 {
1455 int from, to;
1456
1457 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1458 CHECK_NUMBER_COERCE_MARKER (end, 1);
1459
1460 from = min (XFASTINT (beg), XFASTINT (end));
1461 to = max (XFASTINT (beg), XFASTINT (end));
1462
1463 return make_number (to - from);
1464 }
1465
1466 /* Return the number of characters in the NBYTES bytes at PTR.
1467 This works by looking at the contents and checking for multibyte sequences.
1468 However, if the current buffer has enable-multibyte-characters = nil,
1469 we treat each byte as a character. */
1470
1471 int
1472 chars_in_text (ptr, nbytes)
1473 unsigned char *ptr;
1474 int nbytes;
1475 {
1476 /* current_buffer is null at early stages of Emacs initialization. */
1477 if (current_buffer == 0
1478 || NILP (current_buffer->enable_multibyte_characters))
1479 return nbytes;
1480
1481 return multibyte_chars_in_text (ptr, nbytes);
1482 }
1483
1484 /* Return the number of characters in the NBYTES bytes at PTR.
1485 This works by looking at the contents and checking for multibyte sequences.
1486 It ignores enable-multibyte-characters. */
1487
1488 int
1489 multibyte_chars_in_text (ptr, nbytes)
1490 unsigned char *ptr;
1491 int nbytes;
1492 {
1493 unsigned char *endp;
1494 int chars, bytes;
1495
1496 endp = ptr + nbytes;
1497 chars = 0;
1498
1499 while (ptr < endp)
1500 {
1501 if (BASE_LEADING_CODE_P (*ptr))
1502 {
1503 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1504 ptr += bytes;
1505 nbytes -= bytes;
1506 }
1507 else
1508 {
1509 ptr++;
1510 nbytes--;
1511 }
1512 chars++;
1513 }
1514
1515 return chars;
1516 }
1517
1518 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1519 "Concatenate all the argument characters and make the result a string.")
1520 (n, args)
1521 int n;
1522 Lisp_Object *args;
1523 {
1524 int i;
1525 unsigned char *buf
1526 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
1527 unsigned char *p = buf;
1528 Lisp_Object val;
1529
1530 for (i = 0; i < n; i++)
1531 {
1532 int c, len;
1533 unsigned char *str;
1534
1535 if (!INTEGERP (args[i]))
1536 CHECK_NUMBER (args[i], 0);
1537 c = XINT (args[i]);
1538 len = CHAR_STRING (c, p, str);
1539 if (p != str)
1540 /* C is a composite character. */
1541 bcopy (str, p, len);
1542 p += len;
1543 }
1544
1545 /* Here, we can't use make_string_from_bytes because of byte
1546 combining problem. */
1547 val = make_string (buf, p - buf);
1548 return val;
1549 }
1550
1551 #endif /* emacs */
1552 \f
1553 /*** Composite characters staffs ***/
1554
1555 /* Each composite character is identified by CMPCHAR-ID which is
1556 assigned when Emacs needs the character code of the composite
1557 character (e.g. when displaying it on the screen). See the
1558 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1559 composite character is represented in Emacs. */
1560
1561 /* If `static' is defined, it means that it is defined to null string. */
1562 #ifndef static
1563 /* The following function is copied from lread.c. */
1564 static int
1565 hash_string (ptr, len)
1566 unsigned char *ptr;
1567 int len;
1568 {
1569 register unsigned char *p = ptr;
1570 register unsigned char *end = p + len;
1571 register unsigned char c;
1572 register int hash = 0;
1573
1574 while (p != end)
1575 {
1576 c = *p++;
1577 if (c >= 0140) c -= 40;
1578 hash = ((hash<<3) + (hash>>28) + c);
1579 }
1580 return hash & 07777777777;
1581 }
1582 #endif
1583
1584 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1585
1586 static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1587
1588 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1589 integer, where the 1st element is the size of the array, the 2nd
1590 element is how many elements are actually used in the array, and
1591 the remaining elements are CMPCHAR-IDs of composite characters of
1592 the same hash value. */
1593 #define CMPCHAR_HASH_SIZE(table) table[0]
1594 #define CMPCHAR_HASH_USED(table) table[1]
1595 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1596
1597 /* Return CMPCHAR-ID of the composite character in STR of the length
1598 LEN. If the composite character has not yet been registered,
1599 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1600 is the sole function for assigning CMPCHAR-ID. */
1601 int
1602 str_cmpchar_id (str, len)
1603 const unsigned char *str;
1604 int len;
1605 {
1606 int hash_idx, *hashp;
1607 unsigned char *buf;
1608 int embedded_rule; /* 1 if composition rule is embedded. */
1609 int chars; /* number of components. */
1610 int i;
1611 struct cmpchar_info *cmpcharp;
1612
1613 /* The second byte 0xFF means COMPOSITION rule is embedded. */
1614 embedded_rule = (str[1] == 0xFF);
1615
1616 /* At first, get the actual length of the composite character. */
1617 {
1618 const unsigned char *p, *endp = str + 1, *lastp = str + len;
1619 int bytes;
1620
1621 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
1622 if (endp - str < 5)
1623 /* Any composite char have at least 5-byte length. */
1624 return -1;
1625
1626 chars = 0;
1627 p = str + 1;
1628 while (p < endp)
1629 {
1630 if (embedded_rule)
1631 {
1632 p++;
1633 if (p >= endp)
1634 return -1;
1635 }
1636 /* No need of checking if *P is 0xA0 because
1637 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1638 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
1639 chars++;
1640 }
1641 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1642 /* Invalid components. */
1643 return -1;
1644 len = p - str;
1645 }
1646 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1647 hashp = cmpchar_hash_table[hash_idx];
1648
1649 /* Then, look into the hash table. */
1650 if (hashp != NULL)
1651 /* Find the correct one among composite characters of the same
1652 hash value. */
1653 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1654 {
1655 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1656 if (len == cmpcharp->len
1657 && ! bcmp (str, cmpcharp->data, len))
1658 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1659 }
1660
1661 /* We have to register the composite character in cmpchar_table. */
1662 if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1663 /* No, we have no more room for a new composite character. */
1664 return -1;
1665
1666 /* Make the entry in hash table. */
1667 if (hashp == NULL)
1668 {
1669 /* Make a table for 8 composite characters initially. */
1670 hashp = (cmpchar_hash_table[hash_idx]
1671 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1672 CMPCHAR_HASH_SIZE (hashp) = 10;
1673 CMPCHAR_HASH_USED (hashp) = 2;
1674 }
1675 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1676 {
1677 CMPCHAR_HASH_SIZE (hashp) += 8;
1678 hashp = (cmpchar_hash_table[hash_idx]
1679 = (int *) xrealloc (hashp,
1680 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1681 }
1682 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1683 CMPCHAR_HASH_USED (hashp)++;
1684
1685 /* Set information of the composite character in cmpchar_table. */
1686 if (cmpchar_table_size == 0)
1687 {
1688 /* This is the first composite character to be registered. */
1689 cmpchar_table_size = 256;
1690 cmpchar_table
1691 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1692 * cmpchar_table_size);
1693 }
1694 else if (cmpchar_table_size <= n_cmpchars)
1695 {
1696 cmpchar_table_size += 256;
1697 cmpchar_table
1698 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1699 sizeof (cmpchar_table[0])
1700 * cmpchar_table_size);
1701 }
1702
1703 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1704
1705 cmpcharp->len = len;
1706 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1707 bcopy (str, cmpcharp->data, len);
1708 cmpcharp->data[len] = 0;
1709 cmpcharp->glyph_len = chars;
1710 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1711 if (embedded_rule)
1712 {
1713 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1714 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1715 }
1716 else
1717 {
1718 cmpcharp->cmp_rule = NULL;
1719 cmpcharp->col_offset = NULL;
1720 }
1721
1722 /* Setup GLYPH data and composition rules (if any) so as not to make
1723 them every time on displaying. */
1724 {
1725 unsigned char *bufp;
1726 int width;
1727 float leftmost = 0.0, rightmost = 1.0;
1728
1729 if (embedded_rule)
1730 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1731 cmpcharp->col_offset[0] = 0;
1732
1733 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1734 {
1735 if (embedded_rule)
1736 cmpcharp->cmp_rule[i] = *bufp++;
1737
1738 if (*bufp == 0xA0) /* This is an ASCII character. */
1739 {
1740 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1741 width = 1;
1742 bufp++;
1743 }
1744 else /* Multibyte character. */
1745 {
1746 /* Make `bufp' point normal multi-byte form temporally. */
1747 *bufp -= 0x20;
1748 cmpcharp->glyph[i]
1749 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1750 width = WIDTH_BY_CHAR_HEAD (*bufp);
1751 *bufp += 0x20;
1752 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1753 }
1754
1755 if (embedded_rule && i > 0)
1756 {
1757 /* Reference points (global_ref and new_ref) are
1758 encoded as below:
1759
1760 0--1--2 -- ascent
1761 | |
1762 | |
1763 | 4 -+--- center
1764 -- 3 5 -- baseline
1765 | |
1766 6--7--8 -- descent
1767
1768 Now, we calculate the column offset of the new glyph
1769 from the left edge of the first glyph. This can avoid
1770 the same calculation everytime displaying this
1771 composite character. */
1772
1773 /* Reference points of global glyph and new glyph. */
1774 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1775 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1776 /* Column offset relative to the first glyph. */
1777 float left = (leftmost
1778 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1779 - (new_ref % 3) * width / 2.0);
1780
1781 cmpcharp->col_offset[i] = left;
1782 if (left < leftmost)
1783 leftmost = left;
1784 if (left + width > rightmost)
1785 rightmost = left + width;
1786 }
1787 else
1788 {
1789 if (width > rightmost)
1790 rightmost = width;
1791 }
1792 }
1793 if (embedded_rule)
1794 {
1795 /* Now col_offset[N] are relative to the left edge of the
1796 first component. Make them relative to the left edge of
1797 overall glyph. */
1798 for (i = 0; i < chars; i++)
1799 cmpcharp->col_offset[i] -= leftmost;
1800 /* Make rightmost holds width of overall glyph. */
1801 rightmost -= leftmost;
1802 }
1803
1804 cmpcharp->width = rightmost;
1805 if (cmpcharp->width < rightmost)
1806 /* To get a ceiling integer value. */
1807 cmpcharp->width++;
1808 }
1809
1810 cmpchar_table[n_cmpchars] = cmpcharp;
1811
1812 return n_cmpchars++;
1813 }
1814
1815 /* Return the Nth element of the composite character C. If NOERROR is
1816 nonzero, return 0 on error condition (C is an invalid composite
1817 charcter, or N is out of range). */
1818 int
1819 cmpchar_component (c, n, noerror)
1820 int c, n, noerror;
1821 {
1822 int id = COMPOSITE_CHAR_ID (c);
1823
1824 if (id < 0 || id >= n_cmpchars)
1825 {
1826 /* C is not a valid composite character. */
1827 if (noerror) return 0;
1828 error ("Invalid composite character: %d", c) ;
1829 }
1830 if (n >= cmpchar_table[id]->glyph_len)
1831 {
1832 /* No such component. */
1833 if (noerror) return 0;
1834 args_out_of_range (make_number (c), make_number (n));
1835 }
1836 /* No face data is stored in glyph code. */
1837 return ((int) (cmpchar_table[id]->glyph[n]));
1838 }
1839
1840 DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1841 "T if CHAR is a composite character.")
1842 (ch)
1843 Lisp_Object ch;
1844 {
1845 CHECK_NUMBER (ch, 0);
1846 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1847 }
1848
1849 DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1850 2, 2, 0,
1851 "Return the Nth component character of composite character CHARACTER.")
1852 (character, n)
1853 Lisp_Object character, n;
1854 {
1855 int id;
1856
1857 CHECK_NUMBER (character, 0);
1858 CHECK_NUMBER (n, 1);
1859
1860 return (make_number (cmpchar_component (XINT (character), XINT (n), 0)));
1861 }
1862
1863 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1864 2, 2, 0,
1865 "Return the Nth composition rule of composite character CHARACTER.\n\
1866 The returned rule is for composing the Nth component\n\
1867 on the (N-1)th component.\n\
1868 If CHARACTER should be composed relatively or N is 0, return 255.")
1869 (character, n)
1870 Lisp_Object character, n;
1871 {
1872 int id;
1873
1874 CHECK_NUMBER (character, 0);
1875 CHECK_NUMBER (n, 1);
1876
1877 id = COMPOSITE_CHAR_ID (XINT (character));
1878 if (id < 0 || id >= n_cmpchars)
1879 error ("Invalid composite character: %d", XINT (character));
1880 if (XINT (n) < 0 || XINT (n) >= cmpchar_table[id]->glyph_len)
1881 args_out_of_range (character, n);
1882
1883 return make_number (cmpchar_table[id]->cmp_rule
1884 ? cmpchar_table[id]->cmp_rule[XINT (n)]
1885 : 255);
1886 }
1887
1888 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1889 Scmpchar_cmp_rule_p, 1, 1, 0,
1890 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1891 (character)
1892 Lisp_Object character;
1893 {
1894 int id;
1895
1896 CHECK_NUMBER (character, 0);
1897 id = COMPOSITE_CHAR_ID (XINT (character));
1898 if (id < 0 || id >= n_cmpchars)
1899 error ("Invalid composite character: %d", XINT (character));
1900
1901 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1902 }
1903
1904 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1905 Scmpchar_cmp_count, 1, 1, 0,
1906 "Return number of compoents of composite character CHARACTER.")
1907 (character)
1908 Lisp_Object character;
1909 {
1910 int id;
1911
1912 CHECK_NUMBER (character, 0);
1913 id = COMPOSITE_CHAR_ID (XINT (character));
1914 if (id < 0 || id >= n_cmpchars)
1915 error ("Invalid composite character: %d", XINT (character));
1916
1917 return (make_number (cmpchar_table[id]->glyph_len));
1918 }
1919
1920 DEFUN ("compose-string", Fcompose_string, Scompose_string,
1921 1, 1, 0,
1922 "Return one char string composed from all characters in STRING.")
1923 (str)
1924 Lisp_Object str;
1925 {
1926 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1927 int len, i;
1928
1929 CHECK_STRING (str, 0);
1930
1931 buf[0] = LEADING_CODE_COMPOSITION;
1932 p = XSTRING (str)->data;
1933 pend = p + STRING_BYTES (XSTRING (str));
1934 i = 1;
1935 while (p < pend)
1936 {
1937 if (*p < 0x20) /* control code */
1938 error ("Invalid component character: %d", *p);
1939 else if (*p < 0x80) /* ASCII */
1940 {
1941 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1942 error ("Too long string to be composed: %s", XSTRING (str)->data);
1943 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1944 code itself. */
1945 buf[i++] = 0xA0;
1946 buf[i++] = *p++ + 0x80;
1947 }
1948 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1949 {
1950 /* Already composed. Eliminate the heading
1951 LEADING_CODE_COMPOSITION, keep the remaining bytes
1952 unchanged. */
1953 p++;
1954 if (*p == 255)
1955 error ("Can't compose a rule-based composition character");
1956 ptemp = p;
1957 while (! CHAR_HEAD_P (*p)) p++;
1958 if (str_cmpchar_id (ptemp - 1, p - ptemp + 1) < 0)
1959 error ("Can't compose an invalid composition character");
1960 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1961 error ("Too long string to be composed: %s", XSTRING (str)->data);
1962 bcopy (ptemp, buf + i, p - ptemp);
1963 i += p - ptemp;
1964 }
1965 else /* multibyte char */
1966 {
1967 /* Add 0x20 to the base leading-code, keep the remaining
1968 bytes unchanged. */
1969 int c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
1970
1971 if (len <= 1 || ! CHAR_VALID_P (c, 0))
1972 error ("Can't compose an invalid character");
1973 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1974 error ("Too long string to be composed: %s", XSTRING (str)->data);
1975 bcopy (p, buf + i, len);
1976 buf[i] += 0x20;
1977 p += len, i += len;
1978 }
1979 }
1980
1981 if (i < 5)
1982 /* STR contains only one character, which can't be composed. */
1983 error ("Too short string to be composed: %s", XSTRING (str)->data);
1984
1985 return make_string_from_bytes (buf, 1, i);
1986 }
1987
1988 \f
1989 int
1990 charset_id_internal (charset_name)
1991 char *charset_name;
1992 {
1993 Lisp_Object val;
1994
1995 val= Fget (intern (charset_name), Qcharset);
1996 if (!VECTORP (val))
1997 error ("Charset %s is not defined", charset_name);
1998
1999 return (XINT (XVECTOR (val)->contents[0]));
2000 }
2001
2002 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
2003 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
2004 ()
2005 {
2006 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
2007 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
2008 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
2009 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
2010 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
2011 charset_big5_1 = charset_id_internal ("chinese-big5-1");
2012 charset_big5_2 = charset_id_internal ("chinese-big5-2");
2013 return Qnil;
2014 }
2015
2016 void
2017 init_charset_once ()
2018 {
2019 int i, j, k;
2020
2021 staticpro (&Vcharset_table);
2022 staticpro (&Vcharset_symbol_table);
2023 staticpro (&Vgeneric_character_list);
2024
2025 /* This has to be done here, before we call Fmake_char_table. */
2026 Qcharset_table = intern ("charset-table");
2027 staticpro (&Qcharset_table);
2028
2029 /* Intern this now in case it isn't already done.
2030 Setting this variable twice is harmless.
2031 But don't staticpro it here--that is done in alloc.c. */
2032 Qchar_table_extra_slots = intern ("char-table-extra-slots");
2033
2034 /* Now we are ready to set up this property, so we can
2035 create the charset table. */
2036 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
2037 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
2038
2039 Qunknown = intern ("unknown");
2040 staticpro (&Qunknown);
2041 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
2042 Qunknown);
2043
2044 /* Setup tables. */
2045 for (i = 0; i < 2; i++)
2046 for (j = 0; j < 2; j++)
2047 for (k = 0; k < 128; k++)
2048 iso_charset_table [i][j][k] = -1;
2049
2050 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
2051 cmpchar_table_size = n_cmpchars = 0;
2052
2053 for (i = 0; i < 256; i++)
2054 BYTES_BY_CHAR_HEAD (i) = 1;
2055 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
2056 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
2057 BYTES_BY_CHAR_HEAD (i) = 2;
2058 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
2059 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
2060 BYTES_BY_CHAR_HEAD (i) = 3;
2061 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
2062 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
2063 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
2064 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
2065 /* The followings don't reflect the actual bytes, but just to tell
2066 that it is a start of a multibyte character. */
2067 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
2068 BYTES_BY_CHAR_HEAD (0x9E) = 2;
2069 BYTES_BY_CHAR_HEAD (0x9F) = 2;
2070
2071 for (i = 0; i < 128; i++)
2072 WIDTH_BY_CHAR_HEAD (i) = 1;
2073 for (; i < 256; i++)
2074 WIDTH_BY_CHAR_HEAD (i) = 4;
2075 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
2076 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
2077 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
2078 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
2079
2080 {
2081 Lisp_Object val;
2082
2083 val = Qnil;
2084 for (i = 0x81; i < 0x90; i++)
2085 val = Fcons (make_number ((i - 0x70) << 7), val);
2086 for (; i < 0x9A; i++)
2087 val = Fcons (make_number ((i - 0x8F) << 14), val);
2088 for (i = 0xA0; i < 0xF0; i++)
2089 val = Fcons (make_number ((i - 0x70) << 7), val);
2090 for (; i < 0xFF; i++)
2091 val = Fcons (make_number ((i - 0xE0) << 14), val);
2092 val = Fcons (make_number (GENERIC_COMPOSITION_CHAR), val);
2093 Vgeneric_character_list = Fnreverse (val);
2094 }
2095
2096 nonascii_insert_offset = 0;
2097 Vnonascii_translation_table = Qnil;
2098 }
2099
2100 #ifdef emacs
2101
2102 void
2103 syms_of_charset ()
2104 {
2105 Qascii = intern ("ascii");
2106 staticpro (&Qascii);
2107
2108 Qcharset = intern ("charset");
2109 staticpro (&Qcharset);
2110
2111 /* Define ASCII charset now. */
2112 update_charset_table (make_number (CHARSET_ASCII),
2113 make_number (1), make_number (94),
2114 make_number (1),
2115 make_number (0),
2116 make_number ('B'),
2117 make_number (0),
2118 build_string ("ASCII"),
2119 build_string ("ASCII"),
2120 build_string ("ASCII (ISO646 IRV)"));
2121 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
2122 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
2123
2124 Qcomposition = intern ("composition");
2125 staticpro (&Qcomposition);
2126 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
2127
2128 Qauto_fill_chars = intern ("auto-fill-chars");
2129 staticpro (&Qauto_fill_chars);
2130 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
2131
2132 defsubr (&Sdefine_charset);
2133 defsubr (&Sgeneric_character_list);
2134 defsubr (&Sget_unused_iso_final_char);
2135 defsubr (&Sdeclare_equiv_charset);
2136 defsubr (&Sfind_charset_region);
2137 defsubr (&Sfind_charset_string);
2138 defsubr (&Smake_char_internal);
2139 defsubr (&Ssplit_char);
2140 defsubr (&Schar_charset);
2141 defsubr (&Scharset_after);
2142 defsubr (&Siso_charset);
2143 defsubr (&Schar_valid_p);
2144 defsubr (&Sunibyte_char_to_multibyte);
2145 defsubr (&Smultibyte_char_to_unibyte);
2146 defsubr (&Schar_bytes);
2147 defsubr (&Schar_width);
2148 defsubr (&Sstring_width);
2149 defsubr (&Schar_direction);
2150 defsubr (&Schars_in_region);
2151 defsubr (&Sstring);
2152 defsubr (&Scmpcharp);
2153 defsubr (&Scmpchar_component);
2154 defsubr (&Scmpchar_cmp_rule);
2155 defsubr (&Scmpchar_cmp_rule_p);
2156 defsubr (&Scmpchar_cmp_count);
2157 defsubr (&Scompose_string);
2158 defsubr (&Ssetup_special_charsets);
2159
2160 DEFVAR_LISP ("charset-list", &Vcharset_list,
2161 "List of charsets ever defined.");
2162 Vcharset_list = Fcons (Qascii, Qnil);
2163
2164 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
2165 "Vector of cons cell of a symbol and translation table ever defined.\n\
2166 An ID of a translation table is an index of this vector.");
2167 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
2168
2169 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
2170 "Leading-code of composite characters.");
2171 leading_code_composition = LEADING_CODE_COMPOSITION;
2172
2173 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
2174 "Leading-code of private TYPE9N charset of column-width 1.");
2175 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
2176
2177 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
2178 "Leading-code of private TYPE9N charset of column-width 2.");
2179 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
2180
2181 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
2182 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
2183 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
2184
2185 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
2186 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
2187 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
2188
2189 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
2190 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
2191 This is used for converting unibyte text to multibyte,\n\
2192 and for inserting character codes specified by number.\n\n\
2193 This serves to convert a Latin-1 or similar 8-bit character code\n\
2194 to the corresponding Emacs multibyte character code.\n\
2195 Typically the value should be (- (make-char CHARSET 0) 128),\n\
2196 for your choice of character set.\n\
2197 If `nonascii-translation-table' is non-nil, it overrides this variable.");
2198 nonascii_insert_offset = 0;
2199
2200 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
2201 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
2202 This is used for converting unibyte text to multibyte,\n\
2203 and for inserting character codes specified by number.\n\n\
2204 Conversion is performed only when multibyte characters are enabled,\n\
2205 and it serves to convert a Latin-1 or similar 8-bit character code\n\
2206 to the corresponding Emacs character code.\n\n\
2207 If this is nil, `nonascii-insert-offset' is used instead.\n\
2208 See also the docstring of `make-translation-table'.");
2209 Vnonascii_translation_table = Qnil;
2210
2211 DEFVAR_INT ("min-composite-char", &min_composite_char,
2212 "Minimum character code of a composite character.");
2213 min_composite_char = MIN_CHAR_COMPOSITION;
2214
2215 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
2216 "A char-table for characters which invoke auto-filling.\n\
2217 Such characters has value t in this table.");
2218 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
2219 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
2220 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
2221 }
2222
2223 #endif /* emacs */