]> code.delx.au - gnu-emacs/blob - src/charset.c
Lots of comments fixed.
[gnu-emacs] / src / charset.c
1 /* Basic multilingual character support.
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs; see the file COPYING. If not, write to
19 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 Boston, MA 02111-1307, USA. */
21
22 /* At first, see the document in `charset.h' to understand the code in
23 this file. */
24
25 #include <stdio.h>
26
27 #ifdef emacs
28
29 #include <sys/types.h>
30 #include <config.h>
31 #include "lisp.h"
32 #include "buffer.h"
33 #include "charset.h"
34 #include "coding.h"
35 #include "disptab.h"
36
37 #else /* not emacs */
38
39 #include "mulelib.h"
40
41 #endif /* emacs */
42
43 Lisp_Object Qcharset, Qascii, Qcomposition;
44 Lisp_Object Qunknown;
45
46 /* Declaration of special leading-codes. */
47 int leading_code_composition; /* for composite characters */
48 int leading_code_private_11; /* for private DIMENSION1 of 1-column */
49 int leading_code_private_12; /* for private DIMENSION1 of 2-column */
50 int leading_code_private_21; /* for private DIMENSION2 of 1-column */
51 int leading_code_private_22; /* for private DIMENSION2 of 2-column */
52
53 /* Declaration of special charsets. */
54 int charset_ascii; /* ASCII */
55 int charset_composition; /* for a composite character */
56 int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
57 int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
58 int charset_jisx0208; /* JISX0208.1983 (Japanese Kanji) */
59 int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
60 int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
61 int charset_big5_1; /* Big5 Level 1 (Chinese Traditional) */
62 int charset_big5_2; /* Big5 Level 2 (Chinese Traditional) */
63
64 int min_composite_char;
65
66 Lisp_Object Qcharset_table;
67
68 /* A char-table containing information of each character set. */
69 Lisp_Object Vcharset_table;
70
71 /* A vector of charset symbol indexed by charset-id. This is used
72 only for returning charset symbol from C functions. */
73 Lisp_Object Vcharset_symbol_table;
74
75 /* A list of charset symbols ever defined. */
76 Lisp_Object Vcharset_list;
77
78 /* Vector of translation table ever defined.
79 ID of a translation table is used to index this vector. */
80 Lisp_Object Vtranslation_table_vector;
81
82 /* A char-table for characters which may invoke auto-filling. */
83 Lisp_Object Vauto_fill_chars;
84
85 Lisp_Object Qauto_fill_chars;
86
87 /* Tables used by macros BYTES_BY_CHAR_HEAD and WIDTH_BY_CHAR_HEAD. */
88 int bytes_by_char_head[256];
89 int width_by_char_head[256];
90
91 /* Mapping table from ISO2022's charset (specified by DIMENSION,
92 CHARS, and FINAL-CHAR) to Emacs' charset. */
93 int iso_charset_table[2][2][128];
94
95 /* Table of pointers to the structure `cmpchar_info' indexed by
96 CMPCHAR-ID. */
97 struct cmpchar_info **cmpchar_table;
98 /* The current size of `cmpchar_table'. */
99 static int cmpchar_table_size;
100 /* Number of the current composite characters. */
101 int n_cmpchars;
102
103 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
104 unsigned char *_fetch_multibyte_char_p;
105 int _fetch_multibyte_char_len;
106
107 /* Offset to add to a non-ASCII value when inserting it. */
108 int nonascii_insert_offset;
109
110 /* Translation table for converting non-ASCII unibyte characters
111 to multibyte codes, or nil. */
112 Lisp_Object Vnonascii_translation_table;
113
114 /* List of all possible generic characters. */
115 Lisp_Object Vgeneric_character_list;
116
117 #define min(X, Y) ((X) < (Y) ? (X) : (Y))
118 #define max(X, Y) ((X) > (Y) ? (X) : (Y))
119 \f
120 void
121 invalid_character (c)
122 int c;
123 {
124 error ("Invalid character: 0%o, %d, 0x%x", c, c, c);
125 }
126
127 /* Parse composite character string STR of length LENGTH (>= 2) and
128 set BYTES, CHARSET, C1, and C2 as below.
129
130 It is assumed that *STR is LEADING_CODE_COMPOSITION and the
131 following (LENGTH - 1) bytes satisfy !CHAR_HEAD_P.
132
133 If there is a valid composite character, set CHARSET, C1, and C2 to
134 such values that MAKE_CHAR can make the composite character from
135 them. Otherwise, set CHARSET to CHARSET_COMPOSITION, set C1 to the
136 second byte of the sequence, C2 to -1 so that MAKE_CHAR can make
137 the invalid multibyte character whose string representation is two
138 bytes of STR[0] and STR[1]. In any case, set BYTES to LENGTH.
139
140 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
141
142 #define SPLIT_COMPOSITE_SEQ(str, length, bytes, charset, c1, c2) \
143 do { \
144 int cmpchar_id = str_cmpchar_id ((str), (length)); \
145 \
146 (charset) = CHARSET_COMPOSITION; \
147 (bytes) = (length); \
148 if (cmpchar_id >= 0) \
149 { \
150 (c1) = CHAR_FIELD2 (cmpchar_id); \
151 (c2) = CHAR_FIELD3 (cmpchar_id); \
152 } \
153 else \
154 { \
155 (c1) = (str)[1] & 0x7F; \
156 (c2) = -1; \
157 } \
158 } while (0)
159
160 /* Parse non-composite multibyte character string STR of length LENGTH
161 (>= 2) and set BYTES to the length of actual multibyte sequence,
162 CHARSET, C1, and C2 to such values that MAKE_CHAR can make the
163 multibyte character from them.
164
165 It is assumed that *STR is one of base leading codes (excluding
166 LEADING_CODE_COMPOSITION) and the following (LENGTH - 1) bytes
167 satisfy !CHAR_HEAD_P.
168
169 This macro should be called only from SPLIT_MULTIBYTE_SEQ. */
170
171 #define SPLIT_CHARACTER_SEQ(str, length, bytes, charset, c1, c2) \
172 do { \
173 (bytes) = 1; \
174 (charset) = (str)[0]; \
175 if ((charset) >= LEADING_CODE_PRIVATE_11 \
176 && (charset) <= LEADING_CODE_PRIVATE_22) \
177 (charset) = (str)[(bytes)++]; \
178 if ((bytes) < (length)) \
179 { \
180 (c1) = (str)[(bytes)++] & 0x7F; \
181 if ((bytes) < (length)) \
182 (c2) = (str)[(bytes)++] & 0x7F; \
183 else \
184 (c2) = -1; \
185 } \
186 else \
187 (c1) = (c2) = -1; \
188 } while (0)
189
190 /* Parse string STR of length LENGTH and check if a multibyte
191 characters is at STR. set BYTES to the actual length, CHARSET, C1,
192 C2 to proper values for that character. */
193
194 #define SPLIT_MULTIBYTE_SEQ(str, length, bytes, charset, c1, c2) \
195 do { \
196 int i; \
197 if (ASCII_BYTE_P ((str)[0])) \
198 i = 1; \
199 else \
200 for (i = 1; i < (length) && ! CHAR_HEAD_P ((str)[i]); i++); \
201 if (i == 1) \
202 (bytes) = 1, (charset) = CHARSET_ASCII, (c1) = (str)[0] ; \
203 else if ((str)[0] == LEADING_CODE_COMPOSITION) \
204 SPLIT_COMPOSITE_SEQ (str, i, bytes, charset, c1, c2); \
205 else \
206 { \
207 if (i > BYTES_BY_CHAR_HEAD ((str)[0])) \
208 i = BYTES_BY_CHAR_HEAD ((str)[0]); \
209 SPLIT_CHARACTER_SEQ (str, i, bytes, charset, c1, c2); \
210 } \
211 } while (0)
212
213 /* 1 if CHARSET, C1, and C2 compose a valid character, else 0. */
214 #define CHAR_COMPONENTS_VALID_P(charset, c1, c2) \
215 (CHARSET_DIMENSION (charset) == 1 \
216 ? ((c1) >= 0x20 && (c1) <= 0x7F) \
217 : ((c1) >= 0x20 && (c1) <= 0x7F && (c2) >= 0x20 && (c2) <= 0x7F))
218
219 /* Set STR a pointer to the multi-byte form of the character C. If C
220 is not a composite character, the multi-byte form is set in WORKBUF
221 and STR points WORKBUF. The caller should allocate at least 4-byte
222 area at WORKBUF in advance. Returns the length of the multi-byte
223 form. If C is an invalid character, store (C & 0xFF) in WORKBUF[0]
224 and return 1.
225
226 Use macro `CHAR_STRING (C, WORKBUF, STR)' instead of calling this
227 function directly if C can be an ASCII character. */
228
229 int
230 non_ascii_char_to_string (c, workbuf, str)
231 int c;
232 unsigned char *workbuf, **str;
233 {
234 if (c & CHAR_MODIFIER_MASK) /* This includes the case C is negative. */
235 {
236 /* Multibyte character can't have a modifier bit. */
237 if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
238 invalid_character (c);
239
240 /* For Meta, Shift, and Control modifiers, we need special care. */
241 if (c & CHAR_META)
242 {
243 /* Move the meta bit to the right place for a string. */
244 c = (c & ~CHAR_META) | 0x80;
245 }
246 if (c & CHAR_SHIFT)
247 {
248 /* Shift modifier is valid only with [A-Za-z]. */
249 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
250 c &= ~CHAR_SHIFT;
251 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
252 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
253 }
254 if (c & CHAR_CTL)
255 {
256 /* Simulate the code in lread.c. */
257 /* Allow `\C- ' and `\C-?'. */
258 if (c == (CHAR_CTL | ' '))
259 c = 0;
260 else if (c == (CHAR_CTL | '?'))
261 c = 127;
262 /* ASCII control chars are made from letters (both cases),
263 as well as the non-letters within 0100...0137. */
264 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
265 c &= (037 | (~0177 & ~CHAR_CTL));
266 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
267 c &= (037 | (~0177 & ~CHAR_CTL));
268 }
269
270 /* If C still has any modifier bits, it is an invalid character. */
271 if (c & CHAR_MODIFIER_MASK)
272 invalid_character (c);
273
274 *str = workbuf;
275 *workbuf++ = c;
276 }
277 else
278 {
279 int charset, c1, c2;
280
281 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
282 if (charset == CHARSET_COMPOSITION)
283 {
284 if (c >= MAX_CHAR)
285 invalid_character (c);
286 if (c >= MIN_CHAR_COMPOSITION)
287 {
288 /* Valid composite character. */
289 *str = cmpchar_table[COMPOSITE_CHAR_ID (c)]->data;
290 workbuf = *str + cmpchar_table[COMPOSITE_CHAR_ID (c)]->len;
291 }
292 else
293 {
294 /* Invalid but can have multibyte form. */
295 *str = workbuf;
296 *workbuf++ = LEADING_CODE_COMPOSITION;
297 *workbuf++ = c1 | 0x80;
298 }
299 }
300 else if (charset > CHARSET_COMPOSITION)
301 {
302 *str = workbuf;
303 if (charset >= LEADING_CODE_EXT_11)
304 *workbuf++ = (charset < LEADING_CODE_EXT_12
305 ? LEADING_CODE_PRIVATE_11
306 : (charset < LEADING_CODE_EXT_21
307 ? LEADING_CODE_PRIVATE_12
308 : (charset < LEADING_CODE_EXT_22
309 ? LEADING_CODE_PRIVATE_21
310 : LEADING_CODE_PRIVATE_22)));
311 *workbuf++ = charset;
312 if (c1 > 0 && c1 < 32 || c2 > 0 && c2 < 32)
313 invalid_character (c);
314 if (c1)
315 {
316 *workbuf++ = c1 | 0x80;
317 if (c2 > 0)
318 *workbuf++ = c2 | 0x80;
319 }
320 }
321 else if (charset == CHARSET_ASCII)
322 *workbuf++= c & 0x7F;
323 else
324 invalid_character (c);
325 }
326
327 return (workbuf - *str);
328 }
329
330 /* Return the non-ASCII character corresponding to multi-byte form at
331 STR of length LEN. If ACTUAL_LEN is not NULL, store the byte
332 length of the multibyte form in *ACTUAL_LEN.
333
334 Use macro `STRING_CHAR (STR, LEN)' instead of calling this function
335 directly if you want ot handle ASCII characters as well. */
336
337 int
338 string_to_non_ascii_char (str, len, actual_len)
339 const unsigned char *str;
340 int len, *actual_len;
341 {
342 int c, bytes, charset, c1, c2;
343
344 SPLIT_MULTIBYTE_SEQ (str, len, bytes, charset, c1, c2);
345 c = MAKE_CHAR (charset, c1, c2);
346 if (actual_len)
347 *actual_len = bytes;
348 return c;
349 }
350
351 /* Return the length of the multi-byte form at string STR of length LEN.
352 Use the macro MULTIBYTE_FORM_LENGTH instead. */
353 int
354 multibyte_form_length (str, len)
355 const unsigned char *str;
356 int len;
357 {
358 int bytes;
359
360 PARSE_MULTIBYTE_SEQ (str, len, bytes);
361 return bytes;
362 }
363
364 /* Check multibyte form at string STR of length LEN and set variables
365 pointed by CHARSET, C1, and C2 to charset and position codes of the
366 character at STR, and return 0. If there's no multibyte character,
367 return -1. This should be used only in the macro SPLIT_STRING
368 which checks range of STR in advance. */
369
370 int
371 split_non_ascii_string (str, len, charset, c1, c2)
372 const unsigned char *str;
373 unsigned char *c1, *c2;
374 int len, *charset;
375 {
376 register int bytes, cs, code1, code2 = -1;
377
378 SPLIT_MULTIBYTE_SEQ (str, len, bytes, cs, code1, code2);
379 if (cs == CHARSET_ASCII)
380 return -1;
381 *charset = cs;
382 *c1 = code1;
383 *c2 = code2;
384 }
385
386 /* Return 1 iff character C has valid printable glyph.
387 Use the macro CHAR_PRINTABLE_P instead. */
388 int
389 char_printable_p (c)
390 int c;
391 {
392 int charset, c1, c2, chars;
393
394 if (SINGLE_BYTE_CHAR_P (c))
395 return 1;
396 if (c >= MIN_CHAR_COMPOSITION)
397 return (c < MAX_CHAR);
398
399 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
400 if (! CHARSET_DEFINED_P (charset))
401 return 0;
402 if (CHARSET_CHARS (charset) == 94
403 ? c1 <= 32 || c1 >= 127
404 : c1 < 32)
405 return 0;
406 if (CHARSET_DIMENSION (charset) == 2
407 && (CHARSET_CHARS (charset) == 94
408 ? c2 <= 32 || c2 >= 127
409 : c2 < 32))
410 return 0;
411 return 1;
412 }
413
414 /* Translate character C by translation table TABLE. If C
415 is negative, translate a character specified by CHARSET, C1, and C2
416 (C1 and C2 are code points of the character). If no translation is
417 found in TABLE, return C. */
418 int
419 translate_char (table, c, charset, c1, c2)
420 Lisp_Object table;
421 int c, charset, c1, c2;
422 {
423 Lisp_Object ch;
424 int alt_charset, alt_c1, alt_c2, dimension;
425
426 if (c < 0) c = MAKE_CHAR (charset, c1, c2);
427 if (!CHAR_TABLE_P (table)
428 || (ch = Faref (table, make_number (c)), !NATNUMP (ch)))
429 return c;
430
431 SPLIT_CHAR (XFASTINT (ch), alt_charset, alt_c1, alt_c2);
432 dimension = CHARSET_DIMENSION (alt_charset);
433 if (dimension == 1 && alt_c1 > 0 || dimension == 2 && alt_c2 > 0)
434 /* CH is not a generic character, just return it. */
435 return XFASTINT (ch);
436
437 /* Since CH is a generic character, we must return a specific
438 charater which has the same position codes as C from CH. */
439 if (charset < 0)
440 SPLIT_CHAR (c, charset, c1, c2);
441 if (dimension != CHARSET_DIMENSION (charset))
442 /* We can't make such a character because of dimension mismatch. */
443 return c;
444 return MAKE_CHAR (alt_charset, c1, c2);
445 }
446
447 /* Convert the unibyte character C to multibyte based on
448 Vnonascii_translation_table or nonascii_insert_offset. If they can't
449 convert C to a valid multibyte character, convert it based on
450 DEFAULT_NONASCII_INSERT_OFFSET which makes C a Latin-1 character. */
451
452 int
453 unibyte_char_to_multibyte (c)
454 int c;
455 {
456 if (c < 0400 && c >= 0200)
457 {
458 int c_save = c;
459
460 if (! NILP (Vnonascii_translation_table))
461 {
462 c = XINT (Faref (Vnonascii_translation_table, make_number (c)));
463 if (c >= 0400 && ! char_valid_p (c, 0))
464 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
465 }
466 else if (c >= 0240 && nonascii_insert_offset > 0)
467 {
468 c += nonascii_insert_offset;
469 if (c < 0400 || ! char_valid_p (c, 0))
470 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
471 }
472 else if (c >= 0240)
473 c = c_save + DEFAULT_NONASCII_INSERT_OFFSET;
474 }
475 return c;
476 }
477
478
479 /* Convert the multibyte character C to unibyte 8-bit character based
480 on Vnonascii_translation_table or nonascii_insert_offset. If
481 REV_TBL is non-nil, it should be a reverse table of
482 Vnonascii_translation_table, i.e. what given by:
483 Fchar_table_extra_slot (Vnonascii_translation_table, make_number (0)) */
484
485 int
486 multibyte_char_to_unibyte (c, rev_tbl)
487 int c;
488 Lisp_Object rev_tbl;
489 {
490 if (!SINGLE_BYTE_CHAR_P (c))
491 {
492 int c_save = c;
493
494 if (! CHAR_TABLE_P (rev_tbl)
495 && CHAR_TABLE_P (Vnonascii_translation_table))
496 rev_tbl = Fchar_table_extra_slot (Vnonascii_translation_table,
497 make_number (0));
498 if (CHAR_TABLE_P (rev_tbl))
499 {
500 Lisp_Object temp;
501 temp = Faref (rev_tbl, make_number (c));
502 if (INTEGERP (temp))
503 c = XINT (temp);
504 if (c >= 256)
505 c = (c_save & 0177) + 0200;
506 }
507 else
508 {
509 if (nonascii_insert_offset > 0)
510 c -= nonascii_insert_offset;
511 if (c < 128 || c >= 256)
512 c = (c_save & 0177) + 0200;
513 }
514 }
515
516 return c;
517 }
518
519 \f
520 /* Update the table Vcharset_table with the given arguments (see the
521 document of `define-charset' for the meaning of each argument).
522 Several other table contents are also updated. The caller should
523 check the validity of CHARSET-ID and the remaining arguments in
524 advance. */
525
526 void
527 update_charset_table (charset_id, dimension, chars, width, direction,
528 iso_final_char, iso_graphic_plane,
529 short_name, long_name, description)
530 Lisp_Object charset_id, dimension, chars, width, direction;
531 Lisp_Object iso_final_char, iso_graphic_plane;
532 Lisp_Object short_name, long_name, description;
533 {
534 int charset = XINT (charset_id);
535 int bytes;
536 unsigned char leading_code_base, leading_code_ext;
537
538 if (NILP (CHARSET_TABLE_ENTRY (charset)))
539 CHARSET_TABLE_ENTRY (charset)
540 = Fmake_vector (make_number (CHARSET_MAX_IDX), Qnil);
541
542 /* Get byte length of multibyte form, base leading-code, and
543 extended leading-code of the charset. See the comment under the
544 title "GENERAL NOTE on CHARACTER SET (CHARSET)" in charset.h. */
545 bytes = XINT (dimension);
546 if (charset < MIN_CHARSET_PRIVATE_DIMENSION1)
547 {
548 /* Official charset, it doesn't have an extended leading-code. */
549 if (charset != CHARSET_ASCII)
550 bytes += 1; /* For a base leading-code. */
551 leading_code_base = charset;
552 leading_code_ext = 0;
553 }
554 else
555 {
556 /* Private charset. */
557 bytes += 2; /* For base and extended leading-codes. */
558 leading_code_base
559 = (charset < LEADING_CODE_EXT_12
560 ? LEADING_CODE_PRIVATE_11
561 : (charset < LEADING_CODE_EXT_21
562 ? LEADING_CODE_PRIVATE_12
563 : (charset < LEADING_CODE_EXT_22
564 ? LEADING_CODE_PRIVATE_21
565 : LEADING_CODE_PRIVATE_22)));
566 leading_code_ext = charset;
567 }
568
569 if (BYTES_BY_CHAR_HEAD (leading_code_base) != bytes)
570 error ("Invalid dimension for the charset-ID %d", charset);
571
572 CHARSET_TABLE_INFO (charset, CHARSET_ID_IDX) = charset_id;
573 CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX) = make_number (bytes);
574 CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX) = dimension;
575 CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX) = chars;
576 CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX) = width;
577 CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX) = direction;
578 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX)
579 = make_number (leading_code_base);
580 CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX)
581 = make_number (leading_code_ext);
582 CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX) = iso_final_char;
583 CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX)
584 = iso_graphic_plane;
585 CHARSET_TABLE_INFO (charset, CHARSET_SHORT_NAME_IDX) = short_name;
586 CHARSET_TABLE_INFO (charset, CHARSET_LONG_NAME_IDX) = long_name;
587 CHARSET_TABLE_INFO (charset, CHARSET_DESCRIPTION_IDX) = description;
588 CHARSET_TABLE_INFO (charset, CHARSET_PLIST_IDX) = Qnil;
589
590 {
591 /* If we have already defined a charset which has the same
592 DIMENSION, CHARS and ISO-FINAL-CHAR but the different
593 DIRECTION, we must update the entry REVERSE-CHARSET of both
594 charsets. If there's no such charset, the value of the entry
595 is set to nil. */
596 int i;
597
598 for (i = 0; i <= MAX_CHARSET; i++)
599 if (!NILP (CHARSET_TABLE_ENTRY (i)))
600 {
601 if (CHARSET_DIMENSION (i) == XINT (dimension)
602 && CHARSET_CHARS (i) == XINT (chars)
603 && CHARSET_ISO_FINAL_CHAR (i) == XINT (iso_final_char)
604 && CHARSET_DIRECTION (i) != XINT (direction))
605 {
606 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
607 = make_number (i);
608 CHARSET_TABLE_INFO (i, CHARSET_REVERSE_CHARSET_IDX) = charset_id;
609 break;
610 }
611 }
612 if (i > MAX_CHARSET)
613 /* No such a charset. */
614 CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX)
615 = make_number (-1);
616 }
617
618 if (charset != CHARSET_ASCII
619 && charset < MIN_CHARSET_PRIVATE_DIMENSION1)
620 {
621 width_by_char_head[leading_code_base] = XINT (width);
622
623 /* Update table emacs_code_class. */
624 emacs_code_class[charset] = (bytes == 2
625 ? EMACS_leading_code_2
626 : (bytes == 3
627 ? EMACS_leading_code_3
628 : EMACS_leading_code_4));
629 }
630
631 /* Update table iso_charset_table. */
632 if (ISO_CHARSET_TABLE (dimension, chars, iso_final_char) < 0)
633 ISO_CHARSET_TABLE (dimension, chars, iso_final_char) = charset;
634 }
635
636 #ifdef emacs
637
638 /* Return charset id of CHARSET_SYMBOL, or return -1 if CHARSET_SYMBOL
639 is invalid. */
640 int
641 get_charset_id (charset_symbol)
642 Lisp_Object charset_symbol;
643 {
644 Lisp_Object val;
645 int charset;
646
647 return ((SYMBOLP (charset_symbol)
648 && (val = Fget (charset_symbol, Qcharset), VECTORP (val))
649 && (charset = XINT (XVECTOR (val)->contents[CHARSET_ID_IDX]),
650 CHARSET_VALID_P (charset)))
651 ? charset : -1);
652 }
653
654 /* Return an identification number for a new private charset of
655 DIMENSION and WIDTH. If there's no more room for the new charset,
656 return 0. */
657 Lisp_Object
658 get_new_private_charset_id (dimension, width)
659 int dimension, width;
660 {
661 int charset, from, to;
662
663 if (dimension == 1)
664 {
665 if (width == 1)
666 from = LEADING_CODE_EXT_11, to = LEADING_CODE_EXT_12;
667 else
668 from = LEADING_CODE_EXT_12, to = LEADING_CODE_EXT_21;
669 }
670 else
671 {
672 if (width == 1)
673 from = LEADING_CODE_EXT_21, to = LEADING_CODE_EXT_22;
674 else
675 from = LEADING_CODE_EXT_22, to = LEADING_CODE_EXT_MAX + 1;
676 }
677
678 for (charset = from; charset < to; charset++)
679 if (!CHARSET_DEFINED_P (charset)) break;
680
681 return make_number (charset < to ? charset : 0);
682 }
683
684 DEFUN ("define-charset", Fdefine_charset, Sdefine_charset, 3, 3, 0,
685 "Define CHARSET-ID as the identification number of CHARSET with INFO-VECTOR.\n\
686 If CHARSET-ID is nil, it is decided automatically, which means CHARSET is\n\
687 treated as a private charset.\n\
688 INFO-VECTOR is a vector of the format:\n\
689 [DIMENSION CHARS WIDTH DIRECTION ISO-FINAL-CHAR ISO-GRAPHIC-PLANE\n\
690 SHORT-NAME LONG-NAME DESCRIPTION]\n\
691 The meanings of each elements is as follows:\n\
692 DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.\n\
693 CHARS (integer) is the number of characters in a dimension: 94 or 96.\n\
694 WIDTH (integer) is the number of columns a character in the charset\n\
695 occupies on the screen: one of 0, 1, and 2.\n\
696 \n\
697 DIRECTION (integer) is the rendering direction of characters in the\n\
698 charset when rendering. If 0, render from left to right, else\n\
699 render from right to left.\n\
700 \n\
701 ISO-FINAL-CHAR (character) is the final character of the\n\
702 corresponding ISO 2022 charset.\n\
703 \n\
704 ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked\n\
705 while encoding to variants of ISO 2022 coding system, one of the\n\
706 following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).\n\
707 \n\
708 SHORT-NAME (string) is the short name to refer to the charset.\n\
709 \n\
710 LONG-NAME (string) is the long name to refer to the charset.\n\
711 \n\
712 DESCRIPTION (string) is the description string of the charset.")
713 (charset_id, charset_symbol, info_vector)
714 Lisp_Object charset_id, charset_symbol, info_vector;
715 {
716 Lisp_Object *vec;
717
718 if (!NILP (charset_id))
719 CHECK_NUMBER (charset_id, 0);
720 CHECK_SYMBOL (charset_symbol, 1);
721 CHECK_VECTOR (info_vector, 2);
722
723 if (! NILP (charset_id))
724 {
725 if (! CHARSET_VALID_P (XINT (charset_id)))
726 error ("Invalid CHARSET: %d", XINT (charset_id));
727 else if (CHARSET_DEFINED_P (XINT (charset_id)))
728 error ("Already defined charset: %d", XINT (charset_id));
729 }
730
731 vec = XVECTOR (info_vector)->contents;
732 if (XVECTOR (info_vector)->size != 9
733 || !INTEGERP (vec[0]) || !(XINT (vec[0]) == 1 || XINT (vec[0]) == 2)
734 || !INTEGERP (vec[1]) || !(XINT (vec[1]) == 94 || XINT (vec[1]) == 96)
735 || !INTEGERP (vec[2]) || !(XINT (vec[2]) == 1 || XINT (vec[2]) == 2)
736 || !INTEGERP (vec[3]) || !(XINT (vec[3]) == 0 || XINT (vec[3]) == 1)
737 || !INTEGERP (vec[4]) || !(XINT (vec[4]) >= '0' && XINT (vec[4]) <= '~')
738 || !INTEGERP (vec[5]) || !(XINT (vec[5]) == 0 || XINT (vec[5]) == 1)
739 || !STRINGP (vec[6])
740 || !STRINGP (vec[7])
741 || !STRINGP (vec[8]))
742 error ("Invalid info-vector argument for defining charset %s",
743 XSYMBOL (charset_symbol)->name->data);
744
745 if (NILP (charset_id))
746 {
747 charset_id = get_new_private_charset_id (XINT (vec[0]), XINT (vec[2]));
748 if (XINT (charset_id) == 0)
749 error ("There's no room for a new private charset %s",
750 XSYMBOL (charset_symbol)->name->data);
751 }
752
753 update_charset_table (charset_id, vec[0], vec[1], vec[2], vec[3],
754 vec[4], vec[5], vec[6], vec[7], vec[8]);
755 Fput (charset_symbol, Qcharset, CHARSET_TABLE_ENTRY (XINT (charset_id)));
756 CHARSET_SYMBOL (XINT (charset_id)) = charset_symbol;
757 Vcharset_list = Fcons (charset_symbol, Vcharset_list);
758 return Qnil;
759 }
760
761 DEFUN ("generic-character-list", Fgeneric_character_list,
762 Sgeneric_character_list, 0, 0, 0,
763 "Return a list of all possible generic characters.\n\
764 It includes a generic character for a charset not yet defined.")
765 ()
766 {
767 return Vgeneric_character_list;
768 }
769
770 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
771 Sget_unused_iso_final_char, 2, 2, 0,
772 "Return an unsed ISO's final char for a charset of DIMENISION and CHARS.\n\
773 DIMENSION is the number of bytes to represent a character: 1 or 2.\n\
774 CHARS is the number of characters in a dimension: 94 or 96.\n\
775 \n\
776 This final char is for private use, thus the range is `0' (48) .. `?' (63).\n\
777 If there's no unused final char for the specified kind of charset,\n\
778 return nil.")
779 (dimension, chars)
780 Lisp_Object dimension, chars;
781 {
782 int final_char;
783
784 CHECK_NUMBER (dimension, 0);
785 CHECK_NUMBER (chars, 1);
786 if (XINT (dimension) != 1 && XINT (dimension) != 2)
787 error ("Invalid charset dimension %d, it should be 1 or 2",
788 XINT (dimension));
789 if (XINT (chars) != 94 && XINT (chars) != 96)
790 error ("Invalid charset chars %d, it should be 94 or 96",
791 XINT (chars));
792 for (final_char = '0'; final_char <= '?'; final_char++)
793 {
794 if (ISO_CHARSET_TABLE (dimension, chars, make_number (final_char)) < 0)
795 break;
796 }
797 return (final_char <= '?' ? make_number (final_char) : Qnil);
798 }
799
800 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
801 4, 4, 0,
802 "Declare a charset of DIMENSION, CHARS, FINAL-CHAR is the same as CHARSET.\n\
803 CHARSET should be defined by `defined-charset' in advance.")
804 (dimension, chars, final_char, charset_symbol)
805 Lisp_Object dimension, chars, final_char, charset_symbol;
806 {
807 int charset;
808
809 CHECK_NUMBER (dimension, 0);
810 CHECK_NUMBER (chars, 1);
811 CHECK_NUMBER (final_char, 2);
812 CHECK_SYMBOL (charset_symbol, 3);
813
814 if (XINT (dimension) != 1 && XINT (dimension) != 2)
815 error ("Invalid DIMENSION %d, it should be 1 or 2", XINT (dimension));
816 if (XINT (chars) != 94 && XINT (chars) != 96)
817 error ("Invalid CHARS %d, it should be 94 or 96", XINT (chars));
818 if (XINT (final_char) < '0' || XFASTINT (final_char) > '~')
819 error ("Invalid FINAL-CHAR %c, it should be `0'..`~'", XINT (chars));
820 if ((charset = get_charset_id (charset_symbol)) < 0)
821 error ("Invalid charset %s", XSYMBOL (charset_symbol)->name->data);
822
823 ISO_CHARSET_TABLE (dimension, chars, final_char) = charset;
824 return Qnil;
825 }
826
827 /* Return number of different charsets in STR of length LEN. In
828 addition, for each found charset N, CHARSETS[N] is set 1. The
829 caller should allocate CHARSETS (MAX_CHARSET + 1 elements) in advance.
830 It may lookup a translation table TABLE if supplied.
831
832 If CMPCHARP is nonzero and some composite character is found,
833 CHARSETS[128] is also set 1 and the returned number is incremented
834 by 1.
835
836 If MULTIBYTE is zero, do not check multibyte characters, i.e. if
837 any ASCII codes (7-bit) are found, CHARSET[0] is set to 1, if any
838 8-bit codes are found CHARSET[1] is set to 1. */
839
840 int
841 find_charset_in_str (str, len, charsets, table, cmpcharp, multibyte)
842 unsigned char *str;
843 int len, *charsets;
844 Lisp_Object table;
845 int cmpcharp;
846 int multibyte;
847 {
848 register int num = 0, c;
849
850 if (! multibyte)
851 {
852 unsigned char *endp = str + len;
853 int maskbits = 0;
854
855 while (str < endp && maskbits != 3)
856 maskbits |= (*str++ < 0x80 ? 1 : 2);
857 if (maskbits & 1)
858 {
859 charsets[0] = 1;
860 num++;
861 }
862 if (maskbits & 2)
863 {
864 charsets[1] = 1;
865 num++;
866 }
867 return num;
868 }
869
870 if (! CHAR_TABLE_P (table))
871 table = Qnil;
872
873 while (len > 0)
874 {
875 int bytes, charset;
876 c = *str;
877
878 if (c == LEADING_CODE_COMPOSITION)
879 {
880 int cmpchar_id = str_cmpchar_id (str, len);
881 GLYPH *glyph;
882
883 if (cmpchar_id >= 0)
884 {
885 struct cmpchar_info *cmp_p = cmpchar_table[cmpchar_id];
886 int i;
887
888 for (i = 0; i < cmp_p->glyph_len; i++)
889 {
890 c = cmp_p->glyph[i];
891 if (!NILP (table))
892 {
893 if ((c = translate_char (table, c, 0, 0, 0)) < 0)
894 c = cmp_p->glyph[i];
895 }
896 if ((charset = CHAR_CHARSET (c)) < 0)
897 charset = CHARSET_ASCII;
898 if (!charsets[charset])
899 {
900 charsets[charset] = 1;
901 num += 1;
902 }
903 }
904 str += cmp_p->len;
905 len -= cmp_p->len;
906 if (cmpcharp && !charsets[CHARSET_COMPOSITION])
907 {
908 charsets[CHARSET_COMPOSITION] = 1;
909 num += 1;
910 }
911 continue;
912 }
913
914 charset = 1; /* This leads to `unknown' charset. */
915 bytes = 1;
916 }
917 else
918 {
919 c = STRING_CHAR_AND_LENGTH (str, len, bytes);
920 if (! NILP (table))
921 {
922 int c1 = translate_char (table, c, 0, 0, 0);
923 if (c1 >= 0)
924 c = c1;
925 }
926 charset = CHAR_CHARSET (c);
927 }
928
929 if (!charsets[charset])
930 {
931 charsets[charset] = 1;
932 num += 1;
933 }
934 str += bytes;
935 len -= bytes;
936 }
937 return num;
938 }
939
940 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
941 2, 3, 0,
942 "Return a list of charsets in the region between BEG and END.\n\
943 BEG and END are buffer positions.\n\
944 If the region contains any composite character,\n\
945 `composition' is included in the returned list.\n\
946 Optional arg TABLE if non-nil is a translation table to look up.\n\
947 \n\
948 If the region contains invalid multiybte characters,\n\
949 `unknown' is included in the returned list.\n\
950 \n\
951 If the current buffer is unibyte, the returned list contains\n\
952 `ascii' if any 7-bit characters are found,\n\
953 and `unknown' if any 8-bit characters are found.")
954 (beg, end, table)
955 Lisp_Object beg, end, table;
956 {
957 int charsets[MAX_CHARSET + 1];
958 int from, from_byte, to, stop, stop_byte, i;
959 Lisp_Object val;
960 int undefined;
961 int multibyte = !NILP (current_buffer->enable_multibyte_characters);
962
963 validate_region (&beg, &end);
964 from = XFASTINT (beg);
965 stop = to = XFASTINT (end);
966
967 if (from < GPT && GPT < to)
968 {
969 stop = GPT;
970 stop_byte = GPT_BYTE;
971 }
972 else
973 stop_byte = CHAR_TO_BYTE (stop);
974
975 from_byte = CHAR_TO_BYTE (from);
976
977 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
978 while (1)
979 {
980 find_charset_in_str (BYTE_POS_ADDR (from_byte), stop_byte - from_byte,
981 charsets, table, 1, multibyte);
982 if (stop < to)
983 {
984 from = stop, from_byte = stop_byte;
985 stop = to, stop_byte = CHAR_TO_BYTE (stop);
986 }
987 else
988 break;
989 }
990
991 val = Qnil;
992 undefined = 0;
993 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
994 if (charsets[i])
995 {
996 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
997 val = Fcons (CHARSET_SYMBOL (i), val);
998 else
999 undefined = 1;
1000 }
1001 if (undefined)
1002 val = Fcons (Qunknown, val);
1003 return val;
1004 }
1005
1006 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1007 1, 2, 0,
1008 "Return a list of charsets in STR.\n\
1009 If the string contains any composite characters,\n\
1010 `composition' is included in the returned list.\n\
1011 Optional arg TABLE if non-nil is a translation table to look up.\n\
1012 \n\
1013 If the region contains invalid multiybte characters,\n\
1014 `unknown' is included in the returned list.\n\
1015 \n\
1016 If STR is unibyte, the returned list contains\n\
1017 `ascii' if any 7-bit characters are found,\n\
1018 and `unknown' if any 8-bit characters are found.")
1019 (str, table)
1020 Lisp_Object str, table;
1021 {
1022 int charsets[MAX_CHARSET + 1];
1023 int i;
1024 Lisp_Object val;
1025 int undefined;
1026 int multibyte;
1027
1028 CHECK_STRING (str, 0);
1029 multibyte = STRING_MULTIBYTE (str);
1030
1031 bzero (charsets, (MAX_CHARSET + 1) * sizeof (int));
1032 find_charset_in_str (XSTRING (str)->data, STRING_BYTES (XSTRING (str)),
1033 charsets, table, 1, multibyte);
1034 val = Qnil;
1035 undefined = 0;
1036 for (i = (multibyte ? MAX_CHARSET : 1); i >= 0; i--)
1037 if (charsets[i])
1038 {
1039 if (CHARSET_DEFINED_P (i) || i == CHARSET_COMPOSITION)
1040 val = Fcons (CHARSET_SYMBOL (i), val);
1041 else
1042 undefined = 1;
1043 }
1044 if (undefined)
1045 val = Fcons (Qunknown, val);
1046 return val;
1047 }
1048 \f
1049 DEFUN ("make-char-internal", Fmake_char_internal, Smake_char_internal, 1, 3, 0,
1050 "")
1051 (charset, code1, code2)
1052 Lisp_Object charset, code1, code2;
1053 {
1054 int charset_id, c1, c2;
1055
1056 CHECK_NUMBER (charset, 0);
1057 charset_id = XINT (charset);
1058 if (!CHARSET_DEFINED_P (charset_id))
1059 error ("Invalid charset ID: %d", XINT (charset));
1060
1061 if (NILP (code1))
1062 c1 = 0;
1063 else
1064 {
1065 CHECK_NUMBER (code1, 1);
1066 c1 = XINT (code1);
1067 }
1068 if (NILP (code2))
1069 c2 = 0;
1070 else
1071 {
1072 CHECK_NUMBER (code2, 2);
1073 c2 = XINT (code2);
1074 }
1075
1076 if (c1 < 0 || c1 > 0xFF || c2 < 0 || c2 > 0xFF)
1077 error ("Invalid code points: %d %d", c1, c2);
1078 c1 &= 0x7F;
1079 c2 &= 0x7F;
1080 if (c1 == 0
1081 ? c2 != 0
1082 : (c2 == 0
1083 ? !CHAR_COMPONENTS_VALID_P (charset, c1, 0x20)
1084 : !CHAR_COMPONENTS_VALID_P (charset, c1, c2)))
1085 error ("Invalid code points: %d %d", c1, c2);
1086
1087 return make_number (MAKE_CHAR (charset_id, c1, c2));
1088 }
1089
1090 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1091 "Return list of charset and one or two position-codes of CHAR.\n\
1092 If CHAR is invalid as a character code,\n\
1093 return a list of symbol `unknown' and CHAR.")
1094 (ch)
1095 Lisp_Object ch;
1096 {
1097 Lisp_Object val;
1098 int c, charset, c1, c2;
1099
1100 CHECK_NUMBER (ch, 0);
1101 c = XFASTINT (ch);
1102 if (!CHAR_VALID_P (c, 1))
1103 return Fcons (Qunknown, Fcons (ch, Qnil));
1104 SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
1105 return (c2 >= 0
1106 ? Fcons (CHARSET_SYMBOL (charset),
1107 Fcons (make_number (c1), Fcons (make_number (c2), Qnil)))
1108 : Fcons (CHARSET_SYMBOL (charset), Fcons (make_number (c1), Qnil)));
1109 }
1110
1111 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 1, 0,
1112 "Return charset of CHAR.")
1113 (ch)
1114 Lisp_Object ch;
1115 {
1116 CHECK_NUMBER (ch, 0);
1117
1118 return CHARSET_SYMBOL (CHAR_CHARSET (XINT (ch)));
1119 }
1120
1121 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
1122 "Return charset of a character in the current buffer at position POS.\n\
1123 If POS is nil, it defauls to the current point.\n\
1124 If POS is out of range, the value is nil.")
1125 (pos)
1126 Lisp_Object pos;
1127 {
1128 register int pos_byte, bytes, charset, c1, c2;
1129 register unsigned char *p;
1130
1131 if (NILP (pos))
1132 pos_byte = PT_BYTE;
1133 else if (MARKERP (pos))
1134 {
1135 pos_byte = marker_byte_position (pos);
1136 if (pos_byte < BEGV_BYTE || pos_byte >= ZV_BYTE)
1137 return Qnil;
1138 }
1139 else
1140 {
1141 CHECK_NUMBER (pos, 0);
1142 if (XINT (pos) < BEGV || XINT (pos) >= ZV)
1143 return Qnil;
1144 pos_byte = CHAR_TO_BYTE (XINT (pos));
1145 }
1146 p = BYTE_POS_ADDR (pos_byte);
1147 if (BASE_LEADING_CODE_P (*p))
1148 {
1149 SPLIT_MULTIBYTE_SEQ (p, Z_BYTE - pos_byte, bytes, charset, c1, c2);
1150 if (charset < 0)
1151 charset = 1;
1152 }
1153 else
1154 charset = CHARSET_ASCII;
1155
1156 return CHARSET_SYMBOL (charset);
1157 }
1158
1159 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
1160 "Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.\n\
1161 \n\
1162 ISO 2022's designation sequence (escape sequence) distinguishes charsets\n\
1163 by their DIMENSION, CHARS, and FINAL-CHAR,\n\
1164 where as Emacs distinguishes them by charset symbol.\n\
1165 See the documentation of the function `charset-info' for the meanings of\n\
1166 DIMENSION, CHARS, and FINAL-CHAR.")
1167 (dimension, chars, final_char)
1168 Lisp_Object dimension, chars, final_char;
1169 {
1170 int charset;
1171
1172 CHECK_NUMBER (dimension, 0);
1173 CHECK_NUMBER (chars, 1);
1174 CHECK_NUMBER (final_char, 2);
1175
1176 if ((charset = ISO_CHARSET_TABLE (dimension, chars, final_char)) < 0)
1177 return Qnil;
1178 return CHARSET_SYMBOL (charset);
1179 }
1180
1181 /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
1182 generic character. If GENERICP is zero, return nonzero iff C is a
1183 valid normal character. Do not call this function directly,
1184 instead use macro CHAR_VALID_P. */
1185 int
1186 char_valid_p (c, genericp)
1187 int c, genericp;
1188 {
1189 int charset, c1, c2;
1190
1191 if (c < 0)
1192 return 0;
1193 if (SINGLE_BYTE_CHAR_P (c))
1194 return 1;
1195 SPLIT_NON_ASCII_CHAR (c, charset, c1, c2);
1196 if (charset == CHARSET_COMPOSITION)
1197 return ((c >= MIN_CHAR_COMPOSITION
1198 && c < MIN_CHAR_COMPOSITION + n_cmpchars)
1199 || (genericp && c == GENERIC_COMPOSITION_CHAR));
1200 if (genericp)
1201 {
1202 if (c1)
1203 {
1204 if (c2 <= 0) c2 = 0x20;
1205 }
1206 else
1207 {
1208 if (c2 <= 0) c1 = c2 = 0x20;
1209 }
1210 }
1211 return (CHARSET_DEFINED_P (charset)
1212 && CHAR_COMPONENTS_VALID_P (charset, c1, c2));
1213 }
1214
1215 DEFUN ("char-valid-p", Fchar_valid_p, Schar_valid_p, 1, 2, 0,
1216 "Return t if OBJECT is a valid normal character.\n\
1217 If optional arg GENERICP is non-nil, also return t if OBJECT is\n\
1218 a valid generic character.")
1219 (object, genericp)
1220 Lisp_Object object, genericp;
1221 {
1222 if (! NATNUMP (object))
1223 return Qnil;
1224 return (CHAR_VALID_P (XFASTINT (object), !NILP (genericp)) ? Qt : Qnil);
1225 }
1226
1227 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
1228 Sunibyte_char_to_multibyte, 1, 1, 0,
1229 "Convert the unibyte character CH to multibyte character.\n\
1230 The conversion is done based on `nonascii-translation-table' (which see)\n\
1231 or `nonascii-insert-offset' (which see).")
1232 (ch)
1233 Lisp_Object ch;
1234 {
1235 int c;
1236
1237 CHECK_NUMBER (ch, 0);
1238 c = XINT (ch);
1239 if (c < 0 || c >= 0400)
1240 error ("Invalid unibyte character: %d", c);
1241 c = unibyte_char_to_multibyte (c);
1242 if (c < 0)
1243 error ("Can't convert to multibyte character: %d", XINT (ch));
1244 return make_number (c);
1245 }
1246
1247 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
1248 Smultibyte_char_to_unibyte, 1, 1, 0,
1249 "Convert the multibyte character CH to unibyte character.\n\
1250 The conversion is done based on `nonascii-translation-table' (which see)\n\
1251 or `nonascii-insert-offset' (which see).")
1252 (ch)
1253 Lisp_Object ch;
1254 {
1255 int c;
1256
1257 CHECK_NUMBER (ch, 0);
1258 c = XINT (ch);
1259 if (! CHAR_VALID_P (c, 0))
1260 error ("Invalid multibyte character: %d", c);
1261 c = multibyte_char_to_unibyte (c, Qnil);
1262 if (c < 0)
1263 error ("Can't convert to unibyte character: %d", XINT (ch));
1264 return make_number (c);
1265 }
1266
1267 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
1268 "Return 1 regardless of the argument CHAR.\n\
1269 This is now an obsolete function. We keep it just for backward compatibility.")
1270 (ch)
1271 Lisp_Object ch;
1272 {
1273 Lisp_Object val;
1274
1275 CHECK_NUMBER (ch, 0);
1276 return make_number (1);
1277 }
1278
1279 /* Return how many bytes C will occupy in a multibyte buffer.
1280 Don't call this function directly, instead use macro CHAR_BYTES. */
1281 int
1282 char_bytes (c)
1283 int c;
1284 {
1285 int bytes;
1286
1287 if (SINGLE_BYTE_CHAR_P (c) || (c & ~GLYPH_MASK_CHAR))
1288 return 1;
1289
1290 if (COMPOSITE_CHAR_P (c))
1291 {
1292 unsigned int id = COMPOSITE_CHAR_ID (c);
1293
1294 bytes = (id < n_cmpchars ? cmpchar_table[id]->len : 1);
1295 }
1296 else
1297 {
1298 int charset = CHAR_CHARSET (c);
1299
1300 bytes = CHARSET_DEFINED_P (charset) ? CHARSET_BYTES (charset) : 1;
1301 }
1302
1303 return bytes;
1304 }
1305
1306 /* Return the width of character of which multi-byte form starts with
1307 C. The width is measured by how many columns occupied on the
1308 screen when displayed in the current buffer. */
1309
1310 #define ONE_BYTE_CHAR_WIDTH(c) \
1311 (c < 0x20 \
1312 ? (c == '\t' \
1313 ? XFASTINT (current_buffer->tab_width) \
1314 : (c == '\n' ? 0 : (NILP (current_buffer->ctl_arrow) ? 4 : 2))) \
1315 : (c < 0x7f \
1316 ? 1 \
1317 : (c == 0x7F \
1318 ? (NILP (current_buffer->ctl_arrow) ? 4 : 2) \
1319 : ((! NILP (current_buffer->enable_multibyte_characters) \
1320 && BASE_LEADING_CODE_P (c)) \
1321 ? WIDTH_BY_CHAR_HEAD (c) \
1322 : 4))))
1323
1324 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
1325 "Return width of CHAR when displayed in the current buffer.\n\
1326 The width is measured by how many columns it occupies on the screen.")
1327 (ch)
1328 Lisp_Object ch;
1329 {
1330 Lisp_Object val, disp;
1331 int c;
1332 struct Lisp_Char_Table *dp = buffer_display_table ();
1333
1334 CHECK_NUMBER (ch, 0);
1335
1336 c = XINT (ch);
1337
1338 /* Get the way the display table would display it. */
1339 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
1340
1341 if (VECTORP (disp))
1342 XSETINT (val, XVECTOR (disp)->size);
1343 else if (SINGLE_BYTE_CHAR_P (c))
1344 XSETINT (val, ONE_BYTE_CHAR_WIDTH (c));
1345 else if (COMPOSITE_CHAR_P (c))
1346 {
1347 int id = COMPOSITE_CHAR_ID (XFASTINT (ch));
1348 XSETFASTINT (val, (id < n_cmpchars ? cmpchar_table[id]->width : 1));
1349 }
1350 else
1351 {
1352 int charset = CHAR_CHARSET (c);
1353
1354 XSETFASTINT (val, CHARSET_WIDTH (charset));
1355 }
1356 return val;
1357 }
1358
1359 /* Return width of string STR of length LEN when displayed in the
1360 current buffer. The width is measured by how many columns it
1361 occupies on the screen. */
1362
1363 int
1364 strwidth (str, len)
1365 unsigned char *str;
1366 int len;
1367 {
1368 unsigned char *endp = str + len;
1369 int width = 0;
1370 struct Lisp_Char_Table *dp = buffer_display_table ();
1371
1372 while (str < endp)
1373 {
1374 if (*str == LEADING_CODE_COMPOSITION)
1375 {
1376 int id = str_cmpchar_id (str, endp - str);
1377
1378 if (id < 0)
1379 {
1380 width += 4;
1381 str++;
1382 }
1383 else
1384 {
1385 width += cmpchar_table[id]->width;
1386 str += cmpchar_table[id]->len;
1387 }
1388 }
1389 else
1390 {
1391 Lisp_Object disp;
1392 int thislen;
1393 int c = STRING_CHAR_AND_LENGTH (str, endp - str, thislen);
1394
1395 /* Get the way the display table would display it. */
1396 if (dp)
1397 disp = DISP_CHAR_VECTOR (dp, c);
1398 else
1399 disp = Qnil;
1400
1401 if (VECTORP (disp))
1402 width += XVECTOR (disp)->size;
1403 else
1404 width += ONE_BYTE_CHAR_WIDTH (*str);
1405
1406 str += thislen;
1407 }
1408 }
1409 return width;
1410 }
1411
1412 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
1413 "Return width of STRING when displayed in the current buffer.\n\
1414 Width is measured by how many columns it occupies on the screen.\n\
1415 When calculating width of a multibyte character in STRING,\n\
1416 only the base leading-code is considered; the validity of\n\
1417 the following bytes is not checked.")
1418 (str)
1419 Lisp_Object str;
1420 {
1421 Lisp_Object val;
1422
1423 CHECK_STRING (str, 0);
1424 XSETFASTINT (val, strwidth (XSTRING (str)->data,
1425 STRING_BYTES (XSTRING (str))));
1426 return val;
1427 }
1428
1429 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
1430 "Return the direction of CHAR.\n\
1431 The returned value is 0 for left-to-right and 1 for right-to-left.")
1432 (ch)
1433 Lisp_Object ch;
1434 {
1435 int charset;
1436
1437 CHECK_NUMBER (ch, 0);
1438 charset = CHAR_CHARSET (XFASTINT (ch));
1439 if (!CHARSET_DEFINED_P (charset))
1440 invalid_character (XINT (ch));
1441 return CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX);
1442 }
1443
1444 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
1445 "Return number of characters between BEG and END.")
1446 (beg, end)
1447 Lisp_Object beg, end;
1448 {
1449 int from, to;
1450
1451 CHECK_NUMBER_COERCE_MARKER (beg, 0);
1452 CHECK_NUMBER_COERCE_MARKER (end, 1);
1453
1454 from = min (XFASTINT (beg), XFASTINT (end));
1455 to = max (XFASTINT (beg), XFASTINT (end));
1456
1457 return make_number (to - from);
1458 }
1459
1460 /* Return the number of characters in the NBYTES bytes at PTR.
1461 This works by looking at the contents and checking for multibyte sequences.
1462 However, if the current buffer has enable-multibyte-characters = nil,
1463 we treat each byte as a character. */
1464
1465 int
1466 chars_in_text (ptr, nbytes)
1467 unsigned char *ptr;
1468 int nbytes;
1469 {
1470 /* current_buffer is null at early stages of Emacs initialization. */
1471 if (current_buffer == 0
1472 || NILP (current_buffer->enable_multibyte_characters))
1473 return nbytes;
1474
1475 return multibyte_chars_in_text (ptr, nbytes);
1476 }
1477
1478 /* Return the number of characters in the NBYTES bytes at PTR.
1479 This works by looking at the contents and checking for multibyte sequences.
1480 It ignores enable-multibyte-characters. */
1481
1482 int
1483 multibyte_chars_in_text (ptr, nbytes)
1484 unsigned char *ptr;
1485 int nbytes;
1486 {
1487 unsigned char *endp;
1488 int chars, bytes;
1489
1490 endp = ptr + nbytes;
1491 chars = 0;
1492
1493 while (ptr < endp)
1494 {
1495 if (BASE_LEADING_CODE_P (*ptr))
1496 {
1497 PARSE_MULTIBYTE_SEQ (ptr, nbytes, bytes);
1498 ptr += bytes;
1499 nbytes -= bytes;
1500 }
1501 else
1502 {
1503 ptr++;
1504 nbytes--;
1505 }
1506 chars++;
1507 }
1508
1509 return chars;
1510 }
1511
1512 DEFUN ("string", Fstring, Sstring, 1, MANY, 0,
1513 "Concatenate all the argument characters and make the result a string.")
1514 (n, args)
1515 int n;
1516 Lisp_Object *args;
1517 {
1518 int i;
1519 unsigned char *buf
1520 = (unsigned char *) alloca (MAX_LENGTH_OF_MULTI_BYTE_FORM * n);
1521 unsigned char *p = buf;
1522 Lisp_Object val;
1523
1524 for (i = 0; i < n; i++)
1525 {
1526 int c, len;
1527 unsigned char *str;
1528
1529 if (!INTEGERP (args[i]))
1530 CHECK_NUMBER (args[i], 0);
1531 c = XINT (args[i]);
1532 len = CHAR_STRING (c, p, str);
1533 if (p != str)
1534 /* C is a composite character. */
1535 bcopy (str, p, len);
1536 p += len;
1537 }
1538
1539 /* Here, we can't use make_string_from_bytes because of byte
1540 combining problem. */
1541 val = make_string (buf, p - buf);
1542 return val;
1543 }
1544
1545 #endif /* emacs */
1546 \f
1547 /*** Composite characters staffs ***/
1548
1549 /* Each composite character is identified by CMPCHAR-ID which is
1550 assigned when Emacs needs the character code of the composite
1551 character (e.g. when displaying it on the screen). See the
1552 document "GENERAL NOTE on COMPOSITE CHARACTER" in `charset.h' how a
1553 composite character is represented in Emacs. */
1554
1555 /* If `static' is defined, it means that it is defined to null string. */
1556 #ifndef static
1557 /* The following function is copied from lread.c. */
1558 static int
1559 hash_string (ptr, len)
1560 unsigned char *ptr;
1561 int len;
1562 {
1563 register unsigned char *p = ptr;
1564 register unsigned char *end = p + len;
1565 register unsigned char c;
1566 register int hash = 0;
1567
1568 while (p != end)
1569 {
1570 c = *p++;
1571 if (c >= 0140) c -= 40;
1572 hash = ((hash<<3) + (hash>>28) + c);
1573 }
1574 return hash & 07777777777;
1575 }
1576 #endif
1577
1578 #define CMPCHAR_HASH_TABLE_SIZE 0xFFF
1579
1580 static int *cmpchar_hash_table[CMPCHAR_HASH_TABLE_SIZE];
1581
1582 /* Each element of `cmpchar_hash_table' is a pointer to an array of
1583 integer, where the 1st element is the size of the array, the 2nd
1584 element is how many elements are actually used in the array, and
1585 the remaining elements are CMPCHAR-IDs of composite characters of
1586 the same hash value. */
1587 #define CMPCHAR_HASH_SIZE(table) table[0]
1588 #define CMPCHAR_HASH_USED(table) table[1]
1589 #define CMPCHAR_HASH_CMPCHAR_ID(table, i) table[i]
1590
1591 /* Return CMPCHAR-ID of the composite character in STR of the length
1592 LEN. If the composite character has not yet been registered,
1593 register it in `cmpchar_table' and assign new CMPCHAR-ID. This
1594 is the sole function for assigning CMPCHAR-ID. */
1595 int
1596 str_cmpchar_id (str, len)
1597 const unsigned char *str;
1598 int len;
1599 {
1600 int hash_idx, *hashp;
1601 unsigned char *buf;
1602 int embedded_rule; /* 1 if composition rule is embedded. */
1603 int chars; /* number of components. */
1604 int i;
1605 struct cmpchar_info *cmpcharp;
1606
1607 /* The second byte 0xFF means COMPOSITION rule is embedded. */
1608 embedded_rule = (str[1] == 0xFF);
1609
1610 /* At first, get the actual length of the composite character. */
1611 {
1612 const unsigned char *p, *endp = str + 1, *lastp = str + len;
1613 int bytes;
1614
1615 while (endp < lastp && ! CHAR_HEAD_P (*endp)) endp++;
1616 if (endp - str < 5)
1617 /* Any composite char have at least 5-byte length. */
1618 return -1;
1619
1620 chars = 0;
1621 p = str + 1;
1622 while (p < endp)
1623 {
1624 if (embedded_rule)
1625 {
1626 p++;
1627 if (p >= endp)
1628 return -1;
1629 }
1630 /* No need of checking if *P is 0xA0 because
1631 BYTES_BY_CHAR_HEAD (0x80) surely returns 2. */
1632 p += BYTES_BY_CHAR_HEAD (*p - 0x20);
1633 chars++;
1634 }
1635 if (p > endp || chars < 2 || chars > MAX_COMPONENT_COUNT)
1636 /* Invalid components. */
1637 return -1;
1638 len = p - str;
1639 }
1640 hash_idx = hash_string (str, len) % CMPCHAR_HASH_TABLE_SIZE;
1641 hashp = cmpchar_hash_table[hash_idx];
1642
1643 /* Then, look into the hash table. */
1644 if (hashp != NULL)
1645 /* Find the correct one among composite characters of the same
1646 hash value. */
1647 for (i = 2; i < CMPCHAR_HASH_USED (hashp); i++)
1648 {
1649 cmpcharp = cmpchar_table[CMPCHAR_HASH_CMPCHAR_ID (hashp, i)];
1650 if (len == cmpcharp->len
1651 && ! bcmp (str, cmpcharp->data, len))
1652 return CMPCHAR_HASH_CMPCHAR_ID (hashp, i);
1653 }
1654
1655 /* We have to register the composite character in cmpchar_table. */
1656 if (n_cmpchars >= (CHAR_FIELD2_MASK | CHAR_FIELD3_MASK))
1657 /* No, we have no more room for a new composite character. */
1658 return -1;
1659
1660 /* Make the entry in hash table. */
1661 if (hashp == NULL)
1662 {
1663 /* Make a table for 8 composite characters initially. */
1664 hashp = (cmpchar_hash_table[hash_idx]
1665 = (int *) xmalloc (sizeof (int) * (2 + 8)));
1666 CMPCHAR_HASH_SIZE (hashp) = 10;
1667 CMPCHAR_HASH_USED (hashp) = 2;
1668 }
1669 else if (CMPCHAR_HASH_USED (hashp) >= CMPCHAR_HASH_SIZE (hashp))
1670 {
1671 CMPCHAR_HASH_SIZE (hashp) += 8;
1672 hashp = (cmpchar_hash_table[hash_idx]
1673 = (int *) xrealloc (hashp,
1674 sizeof (int) * CMPCHAR_HASH_SIZE (hashp)));
1675 }
1676 CMPCHAR_HASH_CMPCHAR_ID (hashp, CMPCHAR_HASH_USED (hashp)) = n_cmpchars;
1677 CMPCHAR_HASH_USED (hashp)++;
1678
1679 /* Set information of the composite character in cmpchar_table. */
1680 if (cmpchar_table_size == 0)
1681 {
1682 /* This is the first composite character to be registered. */
1683 cmpchar_table_size = 256;
1684 cmpchar_table
1685 = (struct cmpchar_info **) xmalloc (sizeof (cmpchar_table[0])
1686 * cmpchar_table_size);
1687 }
1688 else if (cmpchar_table_size <= n_cmpchars)
1689 {
1690 cmpchar_table_size += 256;
1691 cmpchar_table
1692 = (struct cmpchar_info **) xrealloc (cmpchar_table,
1693 sizeof (cmpchar_table[0])
1694 * cmpchar_table_size);
1695 }
1696
1697 cmpcharp = (struct cmpchar_info *) xmalloc (sizeof (struct cmpchar_info));
1698
1699 cmpcharp->len = len;
1700 cmpcharp->data = (unsigned char *) xmalloc (len + 1);
1701 bcopy (str, cmpcharp->data, len);
1702 cmpcharp->data[len] = 0;
1703 cmpcharp->glyph_len = chars;
1704 cmpcharp->glyph = (GLYPH *) xmalloc (sizeof (GLYPH) * chars);
1705 if (embedded_rule)
1706 {
1707 cmpcharp->cmp_rule = (unsigned char *) xmalloc (chars);
1708 cmpcharp->col_offset = (float *) xmalloc (sizeof (float) * chars);
1709 }
1710 else
1711 {
1712 cmpcharp->cmp_rule = NULL;
1713 cmpcharp->col_offset = NULL;
1714 }
1715
1716 /* Setup GLYPH data and composition rules (if any) so as not to make
1717 them every time on displaying. */
1718 {
1719 unsigned char *bufp;
1720 int width;
1721 float leftmost = 0.0, rightmost = 1.0;
1722
1723 if (embedded_rule)
1724 /* At first, col_offset[N] is set to relative to col_offset[0]. */
1725 cmpcharp->col_offset[0] = 0;
1726
1727 for (i = 0, bufp = cmpcharp->data + 1; i < chars; i++)
1728 {
1729 if (embedded_rule)
1730 cmpcharp->cmp_rule[i] = *bufp++;
1731
1732 if (*bufp == 0xA0) /* This is an ASCII character. */
1733 {
1734 cmpcharp->glyph[i] = FAST_MAKE_GLYPH ((*++bufp & 0x7F), 0);
1735 width = 1;
1736 bufp++;
1737 }
1738 else /* Multibyte character. */
1739 {
1740 /* Make `bufp' point normal multi-byte form temporally. */
1741 *bufp -= 0x20;
1742 cmpcharp->glyph[i]
1743 = FAST_MAKE_GLYPH (string_to_non_ascii_char (bufp, 4, 0), 0);
1744 width = WIDTH_BY_CHAR_HEAD (*bufp);
1745 *bufp += 0x20;
1746 bufp += BYTES_BY_CHAR_HEAD (*bufp - 0x20);
1747 }
1748
1749 if (embedded_rule && i > 0)
1750 {
1751 /* Reference points (global_ref and new_ref) are
1752 encoded as below:
1753
1754 0--1--2 -- ascent
1755 | |
1756 | |
1757 | 4 -+--- center
1758 -- 3 5 -- baseline
1759 | |
1760 6--7--8 -- descent
1761
1762 Now, we calculate the column offset of the new glyph
1763 from the left edge of the first glyph. This can avoid
1764 the same calculation everytime displaying this
1765 composite character. */
1766
1767 /* Reference points of global glyph and new glyph. */
1768 int global_ref = (cmpcharp->cmp_rule[i] - 0xA0) / 9;
1769 int new_ref = (cmpcharp->cmp_rule[i] - 0xA0) % 9;
1770 /* Column offset relative to the first glyph. */
1771 float left = (leftmost
1772 + (global_ref % 3) * (rightmost - leftmost) / 2.0
1773 - (new_ref % 3) * width / 2.0);
1774
1775 cmpcharp->col_offset[i] = left;
1776 if (left < leftmost)
1777 leftmost = left;
1778 if (left + width > rightmost)
1779 rightmost = left + width;
1780 }
1781 else
1782 {
1783 if (width > rightmost)
1784 rightmost = width;
1785 }
1786 }
1787 if (embedded_rule)
1788 {
1789 /* Now col_offset[N] are relative to the left edge of the
1790 first component. Make them relative to the left edge of
1791 overall glyph. */
1792 for (i = 0; i < chars; i++)
1793 cmpcharp->col_offset[i] -= leftmost;
1794 /* Make rightmost holds width of overall glyph. */
1795 rightmost -= leftmost;
1796 }
1797
1798 cmpcharp->width = rightmost;
1799 if (cmpcharp->width < rightmost)
1800 /* To get a ceiling integer value. */
1801 cmpcharp->width++;
1802 }
1803
1804 cmpchar_table[n_cmpchars] = cmpcharp;
1805
1806 return n_cmpchars++;
1807 }
1808
1809 /* Return the Nth element of the composite character C. If NOERROR is
1810 nonzero, return 0 on error condition (C is an invalid composite
1811 charcter, or N is out of range). */
1812 int
1813 cmpchar_component (c, n, noerror)
1814 int c, n, noerror;
1815 {
1816 int id = COMPOSITE_CHAR_ID (c);
1817
1818 if (id < 0 || id >= n_cmpchars)
1819 {
1820 /* C is not a valid composite character. */
1821 if (noerror) return 0;
1822 error ("Invalid composite character: %d", c) ;
1823 }
1824 if (n >= cmpchar_table[id]->glyph_len)
1825 {
1826 /* No such component. */
1827 if (noerror) return 0;
1828 args_out_of_range (make_number (c), make_number (n));
1829 }
1830 /* No face data is stored in glyph code. */
1831 return ((int) (cmpchar_table[id]->glyph[n]));
1832 }
1833
1834 DEFUN ("cmpcharp", Fcmpcharp, Scmpcharp, 1, 1, 0,
1835 "T if CHAR is a composite character.")
1836 (ch)
1837 Lisp_Object ch;
1838 {
1839 CHECK_NUMBER (ch, 0);
1840 return (COMPOSITE_CHAR_P (XINT (ch)) ? Qt : Qnil);
1841 }
1842
1843 DEFUN ("composite-char-component", Fcmpchar_component, Scmpchar_component,
1844 2, 2, 0,
1845 "Return the Nth component character of composite character CHARACTER.")
1846 (character, n)
1847 Lisp_Object character, n;
1848 {
1849 int id;
1850
1851 CHECK_NUMBER (character, 0);
1852 CHECK_NUMBER (n, 1);
1853
1854 return (make_number (cmpchar_component (XINT (character), XINT (n), 0)));
1855 }
1856
1857 DEFUN ("composite-char-composition-rule", Fcmpchar_cmp_rule, Scmpchar_cmp_rule,
1858 2, 2, 0,
1859 "Return the Nth composition rule of composite character CHARACTER.\n\
1860 The returned rule is for composing the Nth component\n\
1861 on the (N-1)th component.\n\
1862 If CHARACTER should be composed relatively or N is 0, return 255.")
1863 (character, n)
1864 Lisp_Object character, n;
1865 {
1866 int id;
1867
1868 CHECK_NUMBER (character, 0);
1869 CHECK_NUMBER (n, 1);
1870
1871 id = COMPOSITE_CHAR_ID (XINT (character));
1872 if (id < 0 || id >= n_cmpchars)
1873 error ("Invalid composite character: %d", XINT (character));
1874 if (XINT (n) < 0 || XINT (n) >= cmpchar_table[id]->glyph_len)
1875 args_out_of_range (character, n);
1876
1877 return make_number (cmpchar_table[id]->cmp_rule
1878 ? cmpchar_table[id]->cmp_rule[XINT (n)]
1879 : 255);
1880 }
1881
1882 DEFUN ("composite-char-composition-rule-p", Fcmpchar_cmp_rule_p,
1883 Scmpchar_cmp_rule_p, 1, 1, 0,
1884 "Return non-nil if composite character CHARACTER contains a embedded rule.")
1885 (character)
1886 Lisp_Object character;
1887 {
1888 int id;
1889
1890 CHECK_NUMBER (character, 0);
1891 id = COMPOSITE_CHAR_ID (XINT (character));
1892 if (id < 0 || id >= n_cmpchars)
1893 error ("Invalid composite character: %d", XINT (character));
1894
1895 return (cmpchar_table[id]->cmp_rule ? Qt : Qnil);
1896 }
1897
1898 DEFUN ("composite-char-component-count", Fcmpchar_cmp_count,
1899 Scmpchar_cmp_count, 1, 1, 0,
1900 "Return number of compoents of composite character CHARACTER.")
1901 (character)
1902 Lisp_Object character;
1903 {
1904 int id;
1905
1906 CHECK_NUMBER (character, 0);
1907 id = COMPOSITE_CHAR_ID (XINT (character));
1908 if (id < 0 || id >= n_cmpchars)
1909 error ("Invalid composite character: %d", XINT (character));
1910
1911 return (make_number (cmpchar_table[id]->glyph_len));
1912 }
1913
1914 DEFUN ("compose-string", Fcompose_string, Scompose_string,
1915 1, 1, 0,
1916 "Return one char string composed from all characters in STRING.")
1917 (str)
1918 Lisp_Object str;
1919 {
1920 unsigned char buf[MAX_LENGTH_OF_MULTI_BYTE_FORM], *p, *pend, *ptemp;
1921 int len, i;
1922
1923 CHECK_STRING (str, 0);
1924
1925 buf[0] = LEADING_CODE_COMPOSITION;
1926 p = XSTRING (str)->data;
1927 pend = p + STRING_BYTES (XSTRING (str));
1928 i = 1;
1929 while (p < pend)
1930 {
1931 if (*p < 0x20) /* control code */
1932 error ("Invalid component character: %d", *p);
1933 else if (*p < 0x80) /* ASCII */
1934 {
1935 if (i + 2 >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1936 error ("Too long string to be composed: %s", XSTRING (str)->data);
1937 /* Prepend an ASCII charset indicator 0xA0, set MSB of the
1938 code itself. */
1939 buf[i++] = 0xA0;
1940 buf[i++] = *p++ + 0x80;
1941 }
1942 else if (*p == LEADING_CODE_COMPOSITION) /* composite char */
1943 {
1944 /* Already composed. Eliminate the heading
1945 LEADING_CODE_COMPOSITION, keep the remaining bytes
1946 unchanged. */
1947 p++;
1948 if (*p == 255)
1949 error ("Can't compose a rule-based composition character");
1950 ptemp = p;
1951 while (! CHAR_HEAD_P (*p)) p++;
1952 if (str_cmpchar_id (ptemp - 1, p - ptemp + 1) < 0)
1953 error ("Can't compose an invalid composition character");
1954 if (i + (p - ptemp) >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1955 error ("Too long string to be composed: %s", XSTRING (str)->data);
1956 bcopy (ptemp, buf + i, p - ptemp);
1957 i += p - ptemp;
1958 }
1959 else /* multibyte char */
1960 {
1961 /* Add 0x20 to the base leading-code, keep the remaining
1962 bytes unchanged. */
1963 int c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
1964
1965 if (len <= 1 || ! CHAR_VALID_P (c, 0))
1966 error ("Can't compose an invalid character");
1967 if (i + len >= MAX_LENGTH_OF_MULTI_BYTE_FORM)
1968 error ("Too long string to be composed: %s", XSTRING (str)->data);
1969 bcopy (p, buf + i, len);
1970 buf[i] += 0x20;
1971 p += len, i += len;
1972 }
1973 }
1974
1975 if (i < 5)
1976 /* STR contains only one character, which can't be composed. */
1977 error ("Too short string to be composed: %s", XSTRING (str)->data);
1978
1979 return make_string_from_bytes (buf, 1, i);
1980 }
1981
1982 \f
1983 int
1984 charset_id_internal (charset_name)
1985 char *charset_name;
1986 {
1987 Lisp_Object val;
1988
1989 val= Fget (intern (charset_name), Qcharset);
1990 if (!VECTORP (val))
1991 error ("Charset %s is not defined", charset_name);
1992
1993 return (XINT (XVECTOR (val)->contents[0]));
1994 }
1995
1996 DEFUN ("setup-special-charsets", Fsetup_special_charsets,
1997 Ssetup_special_charsets, 0, 0, 0, "Internal use only.")
1998 ()
1999 {
2000 charset_latin_iso8859_1 = charset_id_internal ("latin-iso8859-1");
2001 charset_jisx0208_1978 = charset_id_internal ("japanese-jisx0208-1978");
2002 charset_jisx0208 = charset_id_internal ("japanese-jisx0208");
2003 charset_katakana_jisx0201 = charset_id_internal ("katakana-jisx0201");
2004 charset_latin_jisx0201 = charset_id_internal ("latin-jisx0201");
2005 charset_big5_1 = charset_id_internal ("chinese-big5-1");
2006 charset_big5_2 = charset_id_internal ("chinese-big5-2");
2007 return Qnil;
2008 }
2009
2010 void
2011 init_charset_once ()
2012 {
2013 int i, j, k;
2014
2015 staticpro (&Vcharset_table);
2016 staticpro (&Vcharset_symbol_table);
2017 staticpro (&Vgeneric_character_list);
2018
2019 /* This has to be done here, before we call Fmake_char_table. */
2020 Qcharset_table = intern ("charset-table");
2021 staticpro (&Qcharset_table);
2022
2023 /* Intern this now in case it isn't already done.
2024 Setting this variable twice is harmless.
2025 But don't staticpro it here--that is done in alloc.c. */
2026 Qchar_table_extra_slots = intern ("char-table-extra-slots");
2027
2028 /* Now we are ready to set up this property, so we can
2029 create the charset table. */
2030 Fput (Qcharset_table, Qchar_table_extra_slots, make_number (0));
2031 Vcharset_table = Fmake_char_table (Qcharset_table, Qnil);
2032
2033 Qunknown = intern ("unknown");
2034 staticpro (&Qunknown);
2035 Vcharset_symbol_table = Fmake_vector (make_number (MAX_CHARSET + 1),
2036 Qunknown);
2037
2038 /* Setup tables. */
2039 for (i = 0; i < 2; i++)
2040 for (j = 0; j < 2; j++)
2041 for (k = 0; k < 128; k++)
2042 iso_charset_table [i][j][k] = -1;
2043
2044 bzero (cmpchar_hash_table, sizeof cmpchar_hash_table);
2045 cmpchar_table_size = n_cmpchars = 0;
2046
2047 for (i = 0; i < 256; i++)
2048 BYTES_BY_CHAR_HEAD (i) = 1;
2049 for (i = MIN_CHARSET_OFFICIAL_DIMENSION1;
2050 i <= MAX_CHARSET_OFFICIAL_DIMENSION1; i++)
2051 BYTES_BY_CHAR_HEAD (i) = 2;
2052 for (i = MIN_CHARSET_OFFICIAL_DIMENSION2;
2053 i <= MAX_CHARSET_OFFICIAL_DIMENSION2; i++)
2054 BYTES_BY_CHAR_HEAD (i) = 3;
2055 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 3;
2056 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 3;
2057 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 4;
2058 BYTES_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 4;
2059 /* The followings don't reflect the actual bytes, but just to tell
2060 that it is a start of a multibyte character. */
2061 BYTES_BY_CHAR_HEAD (LEADING_CODE_COMPOSITION) = 2;
2062 BYTES_BY_CHAR_HEAD (0x9E) = 2;
2063 BYTES_BY_CHAR_HEAD (0x9F) = 2;
2064
2065 for (i = 0; i < 128; i++)
2066 WIDTH_BY_CHAR_HEAD (i) = 1;
2067 for (; i < 256; i++)
2068 WIDTH_BY_CHAR_HEAD (i) = 4;
2069 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_11) = 1;
2070 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_12) = 2;
2071 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_21) = 1;
2072 WIDTH_BY_CHAR_HEAD (LEADING_CODE_PRIVATE_22) = 2;
2073
2074 {
2075 Lisp_Object val;
2076
2077 val = Qnil;
2078 for (i = 0x81; i < 0x90; i++)
2079 val = Fcons (make_number ((i - 0x70) << 7), val);
2080 for (; i < 0x9A; i++)
2081 val = Fcons (make_number ((i - 0x8F) << 14), val);
2082 for (i = 0xA0; i < 0xF0; i++)
2083 val = Fcons (make_number ((i - 0x70) << 7), val);
2084 for (; i < 0xFF; i++)
2085 val = Fcons (make_number ((i - 0xE0) << 14), val);
2086 val = Fcons (make_number (GENERIC_COMPOSITION_CHAR), val);
2087 Vgeneric_character_list = Fnreverse (val);
2088 }
2089
2090 nonascii_insert_offset = 0;
2091 Vnonascii_translation_table = Qnil;
2092 }
2093
2094 #ifdef emacs
2095
2096 void
2097 syms_of_charset ()
2098 {
2099 Qascii = intern ("ascii");
2100 staticpro (&Qascii);
2101
2102 Qcharset = intern ("charset");
2103 staticpro (&Qcharset);
2104
2105 /* Define ASCII charset now. */
2106 update_charset_table (make_number (CHARSET_ASCII),
2107 make_number (1), make_number (94),
2108 make_number (1),
2109 make_number (0),
2110 make_number ('B'),
2111 make_number (0),
2112 build_string ("ASCII"),
2113 build_string ("ASCII"),
2114 build_string ("ASCII (ISO646 IRV)"));
2115 CHARSET_SYMBOL (CHARSET_ASCII) = Qascii;
2116 Fput (Qascii, Qcharset, CHARSET_TABLE_ENTRY (CHARSET_ASCII));
2117
2118 Qcomposition = intern ("composition");
2119 staticpro (&Qcomposition);
2120 CHARSET_SYMBOL (CHARSET_COMPOSITION) = Qcomposition;
2121
2122 Qauto_fill_chars = intern ("auto-fill-chars");
2123 staticpro (&Qauto_fill_chars);
2124 Fput (Qauto_fill_chars, Qchar_table_extra_slots, make_number (0));
2125
2126 defsubr (&Sdefine_charset);
2127 defsubr (&Sgeneric_character_list);
2128 defsubr (&Sget_unused_iso_final_char);
2129 defsubr (&Sdeclare_equiv_charset);
2130 defsubr (&Sfind_charset_region);
2131 defsubr (&Sfind_charset_string);
2132 defsubr (&Smake_char_internal);
2133 defsubr (&Ssplit_char);
2134 defsubr (&Schar_charset);
2135 defsubr (&Scharset_after);
2136 defsubr (&Siso_charset);
2137 defsubr (&Schar_valid_p);
2138 defsubr (&Sunibyte_char_to_multibyte);
2139 defsubr (&Smultibyte_char_to_unibyte);
2140 defsubr (&Schar_bytes);
2141 defsubr (&Schar_width);
2142 defsubr (&Sstring_width);
2143 defsubr (&Schar_direction);
2144 defsubr (&Schars_in_region);
2145 defsubr (&Sstring);
2146 defsubr (&Scmpcharp);
2147 defsubr (&Scmpchar_component);
2148 defsubr (&Scmpchar_cmp_rule);
2149 defsubr (&Scmpchar_cmp_rule_p);
2150 defsubr (&Scmpchar_cmp_count);
2151 defsubr (&Scompose_string);
2152 defsubr (&Ssetup_special_charsets);
2153
2154 DEFVAR_LISP ("charset-list", &Vcharset_list,
2155 "List of charsets ever defined.");
2156 Vcharset_list = Fcons (Qascii, Qnil);
2157
2158 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
2159 "Vector of cons cell of a symbol and translation table ever defined.\n\
2160 An ID of a translation table is an index of this vector.");
2161 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
2162
2163 DEFVAR_INT ("leading-code-composition", &leading_code_composition,
2164 "Leading-code of composite characters.");
2165 leading_code_composition = LEADING_CODE_COMPOSITION;
2166
2167 DEFVAR_INT ("leading-code-private-11", &leading_code_private_11,
2168 "Leading-code of private TYPE9N charset of column-width 1.");
2169 leading_code_private_11 = LEADING_CODE_PRIVATE_11;
2170
2171 DEFVAR_INT ("leading-code-private-12", &leading_code_private_12,
2172 "Leading-code of private TYPE9N charset of column-width 2.");
2173 leading_code_private_12 = LEADING_CODE_PRIVATE_12;
2174
2175 DEFVAR_INT ("leading-code-private-21", &leading_code_private_21,
2176 "Leading-code of private TYPE9Nx9N charset of column-width 1.");
2177 leading_code_private_21 = LEADING_CODE_PRIVATE_21;
2178
2179 DEFVAR_INT ("leading-code-private-22", &leading_code_private_22,
2180 "Leading-code of private TYPE9Nx9N charset of column-width 2.");
2181 leading_code_private_22 = LEADING_CODE_PRIVATE_22;
2182
2183 DEFVAR_INT ("nonascii-insert-offset", &nonascii_insert_offset,
2184 "Offset for converting non-ASCII unibyte codes 0240...0377 to multibyte.\n\
2185 This is used for converting unibyte text to multibyte,\n\
2186 and for inserting character codes specified by number.\n\n\
2187 This serves to convert a Latin-1 or similar 8-bit character code\n\
2188 to the corresponding Emacs multibyte character code.\n\
2189 Typically the value should be (- (make-char CHARSET 0) 128),\n\
2190 for your choice of character set.\n\
2191 If `nonascii-translation-table' is non-nil, it overrides this variable.");
2192 nonascii_insert_offset = 0;
2193
2194 DEFVAR_LISP ("nonascii-translation-table", &Vnonascii_translation_table,
2195 "Translation table to convert non-ASCII unibyte codes to multibyte.\n\
2196 This is used for converting unibyte text to multibyte,\n\
2197 and for inserting character codes specified by number.\n\n\
2198 Conversion is performed only when multibyte characters are enabled,\n\
2199 and it serves to convert a Latin-1 or similar 8-bit character code\n\
2200 to the corresponding Emacs character code.\n\n\
2201 If this is nil, `nonascii-insert-offset' is used instead.\n\
2202 See also the docstring of `make-translation-table'.");
2203 Vnonascii_translation_table = Qnil;
2204
2205 DEFVAR_INT ("min-composite-char", &min_composite_char,
2206 "Minimum character code of a composite character.");
2207 min_composite_char = MIN_CHAR_COMPOSITION;
2208
2209 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
2210 "A char-table for characters which invoke auto-filling.\n\
2211 Such characters has value t in this table.");
2212 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
2213 CHAR_TABLE_SET (Vauto_fill_chars, make_number (' '), Qt);
2214 CHAR_TABLE_SET (Vauto_fill_chars, make_number ('\n'), Qt);
2215 }
2216
2217 #endif /* emacs */