]> code.delx.au - gnu-emacs/blob - src/character.c
Merge from emacs--devo--0
[gnu-emacs] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2005, 2006 Free Software Foundation, Inc.
5 Copyright (C) 2003, 2006
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 /* Alist of scripts vs representative characters. */
85 Lisp_Object Vscript_representative_chars;
86
87 static Lisp_Object Qchar_script_table;
88
89 /* Mapping table from unibyte chars to multibyte chars. */
90 int unibyte_to_multibyte_table[256];
91
92 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
93 char. */
94 char unibyte_has_multibyte_table[256];
95
96 \f
97
98 /* Store multibyte form of character C at P. If C has modifier bits,
99 handle them appropriately. */
100
101 int
102 char_string (c, p)
103 int c;
104 unsigned char *p;
105 {
106 int bytes;
107
108 if (c & CHAR_MODIFIER_MASK)
109 {
110 /* As an non-ASCII character can't have modifier bits, we just
111 ignore the bits. */
112 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
113 {
114 /* For Meta, Shift, and Control modifiers, we need special care. */
115 if (c & CHAR_META)
116 {
117 /* Move the meta bit to the right place for a string. */
118 c = (c & ~CHAR_META) | 0x80;
119 }
120 if (c & CHAR_SHIFT)
121 {
122 /* Shift modifier is valid only with [A-Za-z]. */
123 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
124 c &= ~CHAR_SHIFT;
125 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
126 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
127 }
128 if (c & CHAR_CTL)
129 {
130 /* Simulate the code in lread.c. */
131 /* Allow `\C- ' and `\C-?'. */
132 if (c == (CHAR_CTL | ' '))
133 c = 0;
134 else if (c == (CHAR_CTL | '?'))
135 c = 127;
136 /* ASCII control chars are made from letters (both cases),
137 as well as the non-letters within 0100...0137. */
138 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
139 c &= (037 | (~0177 & ~CHAR_CTL));
140 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
141 c &= (037 | (~0177 & ~CHAR_CTL));
142 }
143 }
144
145 /* If C still has any modifier bits, just ignore it. */
146 c &= ~CHAR_MODIFIER_MASK;
147 }
148
149 MAYBE_UNIFY_CHAR (c);
150
151 if (c <= MAX_3_BYTE_CHAR)
152 {
153 bytes = CHAR_STRING (c, p);
154 }
155 else if (c <= MAX_4_BYTE_CHAR)
156 {
157 p[0] = (0xF0 | (c >> 18));
158 p[1] = (0x80 | ((c >> 12) & 0x3F));
159 p[2] = (0x80 | ((c >> 6) & 0x3F));
160 p[3] = (0x80 | (c & 0x3F));
161 bytes = 4;
162 }
163 else if (c <= MAX_5_BYTE_CHAR)
164 {
165 p[0] = 0xF8;
166 p[1] = (0x80 | ((c >> 18) & 0x0F));
167 p[2] = (0x80 | ((c >> 12) & 0x3F));
168 p[3] = (0x80 | ((c >> 6) & 0x3F));
169 p[4] = (0x80 | (c & 0x3F));
170 bytes = 5;
171 }
172 else
173 {
174 c = CHAR_TO_BYTE8 (c);
175 bytes = BYTE8_STRING (c, p);
176 }
177
178 return bytes;
179 }
180
181
182 /* Return a character whose multibyte form is at P. Set LEN is not
183 NULL, it must be a pointer to integer. In that case, set *LEN to
184 the byte length of the multibyte form. If ADVANCED is not NULL, is
185 must be a pointer to unsigned char. In that case, set *ADVANCED to
186 the ending address (i.e. the starting address of the next
187 character) of the multibyte form. */
188
189 int
190 string_char (p, advanced, len)
191 const unsigned char *p;
192 const unsigned char **advanced;
193 int *len;
194 {
195 int c;
196 const unsigned char *saved_p = p;
197
198 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
199 {
200 c = STRING_CHAR_ADVANCE (p);
201 }
202 else if (! (*p & 0x08))
203 {
204 c = ((((p)[0] & 0xF) << 18)
205 | (((p)[1] & 0x3F) << 12)
206 | (((p)[2] & 0x3F) << 6)
207 | ((p)[3] & 0x3F));
208 p += 4;
209 }
210 else
211 {
212 c = ((((p)[1] & 0x3F) << 18)
213 | (((p)[2] & 0x3F) << 12)
214 | (((p)[3] & 0x3F) << 6)
215 | ((p)[4] & 0x3F));
216 p += 5;
217 }
218
219 MAYBE_UNIFY_CHAR (c);
220
221 if (len)
222 *len = p - saved_p;
223 if (advanced)
224 *advanced = p;
225 return c;
226 }
227
228
229 /* Translate character C by translation table TABLE. If C is
230 negative, translate a character specified by CHARSET and CODE. If
231 no translation is found in TABLE, return the untranslated
232 character. If TABLE is a list, elements are char tables. In this
233 case, translace C by all tables. */
234
235 int
236 translate_char (table, c)
237 Lisp_Object table;
238 int c;
239 {
240 if (CHAR_TABLE_P (table))
241 {
242 Lisp_Object ch;
243
244 ch = CHAR_TABLE_REF (table, c);
245 if (CHARACTERP (ch))
246 c = XINT (ch);
247 }
248 else
249 {
250 for (; CONSP (table); table = XCDR (table))
251 c = translate_char (XCAR (table), c);
252 }
253 return c;
254 }
255
256 /* Convert the multibyte character C to unibyte 8-bit character based
257 on the current value of charset_unibyte. If dimension of
258 charset_unibyte is more than one, return (C & 0xFF).
259
260 The argument REV_TBL is now ignored. It will be removed in the
261 future. */
262
263 int
264 multibyte_char_to_unibyte (c, rev_tbl)
265 int c;
266 Lisp_Object rev_tbl;
267 {
268 struct charset *charset;
269 unsigned c1;
270
271 if (CHAR_BYTE8_P (c))
272 return CHAR_TO_BYTE8 (c);
273 charset = CHARSET_FROM_ID (charset_unibyte);
274 c1 = ENCODE_CHAR (charset, c);
275 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
276 }
277
278
279 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
280 doc: /* Return non-nil if OBJECT is a character. */)
281 (object, ignore)
282 Lisp_Object object, ignore;
283 {
284 return (CHARACTERP (object) ? Qt : Qnil);
285 }
286
287 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
288 doc: /* Return the character of the maximum code. */)
289 ()
290 {
291 return make_number (MAX_CHAR);
292 }
293
294 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
295 Sunibyte_char_to_multibyte, 1, 1, 0,
296 doc: /* Convert the unibyte character CH to multibyte character.
297 The multibyte character is a result of decoding CH by
298 the current unibyte charset (see `unibyte-charset'). */)
299 (ch)
300 Lisp_Object ch;
301 {
302 int c;
303 struct charset *charset;
304
305 CHECK_CHARACTER (ch);
306 c = XFASTINT (ch);
307 if (c >= 0400)
308 error ("Invalid unibyte character: %d", c);
309 charset = CHARSET_FROM_ID (charset_unibyte);
310 c = DECODE_CHAR (charset, c);
311 if (c < 0)
312 c = BYTE8_TO_CHAR (XFASTINT (ch));
313 return make_number (c);
314 }
315
316 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
317 Smultibyte_char_to_unibyte, 1, 1, 0,
318 doc: /* Convert the multibyte character CH to unibyte character.\n\
319 The unibyte character is a result of encoding CH by
320 the current primary charset (value of `charset-primary'). */)
321 (ch)
322 Lisp_Object ch;
323 {
324 int c;
325
326 CHECK_CHARACTER (ch);
327 c = XFASTINT (ch);
328 c = CHAR_TO_BYTE8 (c);
329 return make_number (c);
330 }
331
332 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
333 doc: /* Return 1 regardless of the argument CHAR.
334 This is now an obsolete function. We keep it just for backward compatibility. */)
335 (ch)
336 Lisp_Object ch;
337 {
338 CHECK_CHARACTER (ch);
339 return make_number (1);
340 }
341
342 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
343 doc: /* Return width of CHAR when displayed in the current buffer.
344 The width is measured by how many columns it occupies on the screen.
345 Tab is taken to occupy `tab-width' columns. */)
346 (ch)
347 Lisp_Object ch;
348 {
349 Lisp_Object disp;
350 int c, width;
351 struct Lisp_Char_Table *dp = buffer_display_table ();
352
353 CHECK_CHARACTER (ch);
354 c = XINT (ch);
355
356 /* Get the way the display table would display it. */
357 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
358
359 if (VECTORP (disp))
360 width = ASIZE (disp);
361 else
362 width = CHAR_WIDTH (c);
363
364 return make_number (width);
365 }
366
367 /* Return width of string STR of length LEN when displayed in the
368 current buffer. The width is measured by how many columns it
369 occupies on the screen. If PRECISION > 0, return the width of
370 longest substring that doesn't exceed PRECISION, and set number of
371 characters and bytes of the substring in *NCHARS and *NBYTES
372 respectively. */
373
374 int
375 c_string_width (str, len, precision, nchars, nbytes)
376 const unsigned char *str;
377 int precision, *nchars, *nbytes;
378 {
379 int i = 0, i_byte = 0;
380 int width = 0;
381 struct Lisp_Char_Table *dp = buffer_display_table ();
382
383 while (i_byte < len)
384 {
385 int bytes, thiswidth;
386 Lisp_Object val;
387 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
388
389 if (dp)
390 {
391 val = DISP_CHAR_VECTOR (dp, c);
392 if (VECTORP (val))
393 thiswidth = XVECTOR (val)->size;
394 else
395 thiswidth = CHAR_WIDTH (c);
396 }
397 else
398 {
399 thiswidth = CHAR_WIDTH (c);
400 }
401
402 if (precision > 0
403 && (width + thiswidth > precision))
404 {
405 *nchars = i;
406 *nbytes = i_byte;
407 return width;
408 }
409 i++;
410 i_byte += bytes;
411 width += thiswidth;
412 }
413
414 if (precision > 0)
415 {
416 *nchars = i;
417 *nbytes = i_byte;
418 }
419
420 return width;
421 }
422
423 /* Return width of string STR of length LEN when displayed in the
424 current buffer. The width is measured by how many columns it
425 occupies on the screen. */
426
427 int
428 strwidth (str, len)
429 unsigned char *str;
430 int len;
431 {
432 return c_string_width (str, len, -1, NULL, NULL);
433 }
434
435 /* Return width of Lisp string STRING when displayed in the current
436 buffer. The width is measured by how many columns it occupies on
437 the screen while paying attention to compositions. If PRECISION >
438 0, return the width of longest substring that doesn't exceed
439 PRECISION, and set number of characters and bytes of the substring
440 in *NCHARS and *NBYTES respectively. */
441
442 int
443 lisp_string_width (string, precision, nchars, nbytes)
444 Lisp_Object string;
445 int precision, *nchars, *nbytes;
446 {
447 int len = SCHARS (string);
448 /* This set multibyte to 0 even if STRING is multibyte when it
449 contains only ascii and eight-bit-graphic, but that's
450 intentional. */
451 int multibyte = len < SBYTES (string);
452 unsigned char *str = SDATA (string);
453 int i = 0, i_byte = 0;
454 int width = 0;
455 struct Lisp_Char_Table *dp = buffer_display_table ();
456
457 while (i < len)
458 {
459 int chars, bytes, thiswidth;
460 Lisp_Object val;
461 int cmp_id;
462 EMACS_INT ignore, end;
463
464 if (find_composition (i, -1, &ignore, &end, &val, string)
465 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
466 >= 0))
467 {
468 thiswidth = composition_table[cmp_id]->width;
469 chars = end - i;
470 bytes = string_char_to_byte (string, end) - i_byte;
471 }
472 else
473 {
474 int c;
475
476 if (multibyte)
477 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
478 else
479 c = str[i_byte], bytes = 1;
480 chars = 1;
481 if (dp)
482 {
483 val = DISP_CHAR_VECTOR (dp, c);
484 if (VECTORP (val))
485 thiswidth = XVECTOR (val)->size;
486 else
487 thiswidth = CHAR_WIDTH (c);
488 }
489 else
490 {
491 thiswidth = CHAR_WIDTH (c);
492 }
493 }
494
495 if (precision > 0
496 && (width + thiswidth > precision))
497 {
498 *nchars = i;
499 *nbytes = i_byte;
500 return width;
501 }
502 i += chars;
503 i_byte += bytes;
504 width += thiswidth;
505 }
506
507 if (precision > 0)
508 {
509 *nchars = i;
510 *nbytes = i_byte;
511 }
512
513 return width;
514 }
515
516 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
517 doc: /* Return width of STRING when displayed in the current buffer.
518 Width is measured by how many columns it occupies on the screen.
519 When calculating width of a multibyte character in STRING,
520 only the base leading-code is considered; the validity of
521 the following bytes is not checked. Tabs in STRING are always
522 taken to occupy `tab-width' columns. */)
523 (str)
524 Lisp_Object str;
525 {
526 Lisp_Object val;
527
528 CHECK_STRING (str);
529 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
530 return val;
531 }
532
533 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
534 doc: /* Return the direction of CHAR.
535 The returned value is 0 for left-to-right and 1 for right-to-left. */)
536 (ch)
537 Lisp_Object ch;
538 {
539 int c;
540
541 CHECK_CHARACTER (ch);
542 c = XINT (ch);
543 return CHAR_TABLE_REF (Vchar_direction_table, c);
544 }
545
546 /* Return the number of characters in the NBYTES bytes at PTR.
547 This works by looking at the contents and checking for multibyte
548 sequences while assuming that there's no invalid sequence.
549 However, if the current buffer has enable-multibyte-characters =
550 nil, we treat each byte as a character. */
551
552 int
553 chars_in_text (ptr, nbytes)
554 const unsigned char *ptr;
555 int nbytes;
556 {
557 /* current_buffer is null at early stages of Emacs initialization. */
558 if (current_buffer == 0
559 || NILP (current_buffer->enable_multibyte_characters))
560 return nbytes;
561
562 return multibyte_chars_in_text (ptr, nbytes);
563 }
564
565 /* Return the number of characters in the NBYTES bytes at PTR.
566 This works by looking at the contents and checking for multibyte
567 sequences while assuming that there's no invalid sequence. It
568 ignores enable-multibyte-characters. */
569
570 int
571 multibyte_chars_in_text (ptr, nbytes)
572 const unsigned char *ptr;
573 int nbytes;
574 {
575 const unsigned char *endp = ptr + nbytes;
576 int chars = 0;
577
578 while (ptr < endp)
579 {
580 int len = MULTIBYTE_LENGTH (ptr, endp);
581
582 if (len == 0)
583 abort ();
584 ptr += len;
585 chars++;
586 }
587
588 return chars;
589 }
590
591 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
592 characters and bytes in it, and store them in *NCHARS and *NBYTES
593 respectively. On counting bytes, pay attention to that 8-bit
594 characters not constructing a valid multibyte sequence are
595 represented by 2-byte in a multibyte text. */
596
597 void
598 parse_str_as_multibyte (str, len, nchars, nbytes)
599 const unsigned char *str;
600 int len, *nchars, *nbytes;
601 {
602 const unsigned char *endp = str + len;
603 int n, chars = 0, bytes = 0;
604
605 if (len >= MAX_MULTIBYTE_LENGTH)
606 {
607 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
608 while (str < adjusted_endp)
609 {
610 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
611 str += n, bytes += n;
612 else
613 str++, bytes += 2;
614 chars++;
615 }
616 }
617 while (str < endp)
618 {
619 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
620 str += n, bytes += n;
621 else
622 str++, bytes += 2;
623 chars++;
624 }
625
626 *nchars = chars;
627 *nbytes = bytes;
628 return;
629 }
630
631 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
632 It actually converts only such 8-bit characters that don't contruct
633 a multibyte sequence to multibyte forms of Latin-1 characters. If
634 NCHARS is nonzero, set *NCHARS to the number of characters in the
635 text. It is assured that we can use LEN bytes at STR as a work
636 area and that is enough. Return the number of bytes of the
637 resulting text. */
638
639 int
640 str_as_multibyte (str, len, nbytes, nchars)
641 unsigned char *str;
642 int len, nbytes, *nchars;
643 {
644 unsigned char *p = str, *endp = str + nbytes;
645 unsigned char *to;
646 int chars = 0;
647 int n;
648
649 if (nbytes >= MAX_MULTIBYTE_LENGTH)
650 {
651 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
652 while (p < adjusted_endp
653 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
654 p += n, chars++;
655 }
656 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
657 p += n, chars++;
658 if (nchars)
659 *nchars = chars;
660 if (p == endp)
661 return nbytes;
662
663 to = p;
664 nbytes = endp - p;
665 endp = str + len;
666 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
667 p = endp - nbytes;
668
669 if (nbytes >= MAX_MULTIBYTE_LENGTH)
670 {
671 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
672 while (p < adjusted_endp)
673 {
674 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
675 {
676 while (n--)
677 *to++ = *p++;
678 }
679 else
680 {
681 int c = *p++;
682 c = BYTE8_TO_CHAR (c);
683 to += CHAR_STRING (c, to);
684 }
685 }
686 chars++;
687 }
688 while (p < endp)
689 {
690 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
691 {
692 while (n--)
693 *to++ = *p++;
694 }
695 else
696 {
697 int c = *p++;
698 c = BYTE8_TO_CHAR (c);
699 to += CHAR_STRING (c, to);
700 }
701 chars++;
702 }
703 if (nchars)
704 *nchars = chars;
705 return (to - str);
706 }
707
708 /* Parse unibyte string at STR of LEN bytes, and return the number of
709 bytes it may ocupy when converted to multibyte string by
710 `str_to_multibyte'. */
711
712 int
713 parse_str_to_multibyte (str, len)
714 unsigned char *str;
715 int len;
716 {
717 unsigned char *endp = str + len;
718 int bytes;
719
720 for (bytes = 0; str < endp; str++)
721 bytes += (*str < 0x80) ? 1 : 2;
722 return bytes;
723 }
724
725
726 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
727 that contains the same single-byte characters. It actually
728 converts all 8-bit characters to multibyte forms. It is assured
729 that we can use LEN bytes at STR as a work area and that is
730 enough. */
731
732 int
733 str_to_multibyte (str, len, bytes)
734 unsigned char *str;
735 int len, bytes;
736 {
737 unsigned char *p = str, *endp = str + bytes;
738 unsigned char *to;
739
740 while (p < endp && *p < 0x80) p++;
741 if (p == endp)
742 return bytes;
743 to = p;
744 bytes = endp - p;
745 endp = str + len;
746 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
747 p = endp - bytes;
748 while (p < endp)
749 {
750 int c = *p++;
751
752 if (c >= 0x80)
753 c = BYTE8_TO_CHAR (c);
754 to += CHAR_STRING (c, to);
755 }
756 return (to - str);
757 }
758
759 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
760 actually converts characters in the range 0x80..0xFF to
761 unibyte. */
762
763 int
764 str_as_unibyte (str, bytes)
765 unsigned char *str;
766 int bytes;
767 {
768 const unsigned char *p = str, *endp = str + bytes;
769 unsigned char *to;
770 int c, len;
771
772 while (p < endp)
773 {
774 c = *p;
775 len = BYTES_BY_CHAR_HEAD (c);
776 if (CHAR_BYTE8_HEAD_P (c))
777 break;
778 p += len;
779 }
780 to = str + (p - str);
781 while (p < endp)
782 {
783 c = *p;
784 len = BYTES_BY_CHAR_HEAD (c);
785 if (CHAR_BYTE8_HEAD_P (c))
786 {
787 c = STRING_CHAR_ADVANCE (p);
788 *to++ = CHAR_TO_BYTE8 (c);
789 }
790 else
791 {
792 while (len--) *to++ = *p++;
793 }
794 }
795 return (to - str);
796 }
797
798 int
799 string_count_byte8 (string)
800 Lisp_Object string;
801 {
802 int multibyte = STRING_MULTIBYTE (string);
803 int nbytes = SBYTES (string);
804 unsigned char *p = SDATA (string);
805 unsigned char *pend = p + nbytes;
806 int count = 0;
807 int c, len;
808
809 if (multibyte)
810 while (p < pend)
811 {
812 c = *p;
813 len = BYTES_BY_CHAR_HEAD (c);
814
815 if (CHAR_BYTE8_HEAD_P (c))
816 count++;
817 p += len;
818 }
819 else
820 while (p < pend)
821 {
822 if (*p++ >= 0x80)
823 count++;
824 }
825 return count;
826 }
827
828
829 Lisp_Object
830 string_escape_byte8 (string)
831 Lisp_Object string;
832 {
833 int nchars = SCHARS (string);
834 int nbytes = SBYTES (string);
835 int multibyte = STRING_MULTIBYTE (string);
836 int byte8_count;
837 const unsigned char *src, *src_end;
838 unsigned char *dst;
839 Lisp_Object val;
840 int c, len;
841
842 if (multibyte && nchars == nbytes)
843 return string;
844
845 byte8_count = string_count_byte8 (string);
846
847 if (byte8_count == 0)
848 return string;
849
850 if (multibyte)
851 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
852 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
853 nbytes + byte8_count * 2);
854 else
855 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
856 val = make_uninit_string (nbytes + byte8_count * 3);
857
858 src = SDATA (string);
859 src_end = src + nbytes;
860 dst = SDATA (val);
861 if (multibyte)
862 while (src < src_end)
863 {
864 c = *src;
865 len = BYTES_BY_CHAR_HEAD (c);
866
867 if (CHAR_BYTE8_HEAD_P (c))
868 {
869 c = STRING_CHAR_ADVANCE (src);
870 c = CHAR_TO_BYTE8 (c);
871 sprintf ((char *) dst, "\\%03o", c);
872 dst += 4;
873 }
874 else
875 while (len--) *dst++ = *src++;
876 }
877 else
878 while (src < src_end)
879 {
880 c = *src++;
881 if (c >= 0x80)
882 {
883 sprintf ((char *) dst, "\\%03o", c);
884 dst += 4;
885 }
886 else
887 *dst++ = c;
888 }
889 return val;
890 }
891
892 \f
893 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
894 doc: /*
895 Concatenate all the argument characters and make the result a string.
896 usage: (string &rest CHARACTERS) */)
897 (n, args)
898 int n;
899 Lisp_Object *args;
900 {
901 int i;
902 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
903 unsigned char *p = buf;
904 int c;
905
906 for (i = 0; i < n; i++)
907 {
908 CHECK_CHARACTER (args[i]);
909 c = XINT (args[i]);
910 p += CHAR_STRING (c, p);
911 }
912
913 return make_string_from_bytes ((char *) buf, n, p - buf);
914 }
915
916 void
917 init_character_once ()
918 {
919 }
920
921 #ifdef emacs
922
923 void
924 syms_of_character ()
925 {
926 DEFSYM (Qcharacterp, "characterp");
927 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
928
929 staticpro (&Vchar_unify_table);
930 Vchar_unify_table = Qnil;
931
932 defsubr (&Smax_char);
933 defsubr (&Scharacterp);
934 defsubr (&Sunibyte_char_to_multibyte);
935 defsubr (&Smultibyte_char_to_unibyte);
936 defsubr (&Schar_bytes);
937 defsubr (&Schar_width);
938 defsubr (&Sstring_width);
939 defsubr (&Schar_direction);
940 defsubr (&Sstring);
941
942 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
943 doc: /*
944 Vector recording all translation tables ever defined.
945 Each element is a pair (SYMBOL . TABLE) relating the table to the
946 symbol naming it. The ID of a translation table is an index into this vector. */);
947 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
948
949 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
950 doc: /*
951 A char-table for characters which invoke auto-filling.
952 Such characters have value t in this table. */);
953 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
954 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
955 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
956
957 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
958 doc: /*
959 A char-table for width (columns) of each character. */);
960 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
961 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
962 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
963 make_number (4));
964
965 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
966 doc: /* A char-table for direction of each character. */);
967 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
968
969 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
970 doc: /* A char-table for each printable character. */);
971 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
972 Fset_char_table_range (Vprintable_chars,
973 Fcons (make_number (32), make_number (126)), Qt);
974 Fset_char_table_range (Vprintable_chars,
975 Fcons (make_number (160),
976 make_number (MAX_5_BYTE_CHAR)), Qt);
977
978 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
979 doc: /* Char table of script symbols.
980 It has one extra slot whose value is a list of script symbols. */);
981
982 /* Intern this now in case it isn't already done.
983 Setting this variable twice is harmless.
984 But don't staticpro it here--that is done in alloc.c. */
985 Qchar_table_extra_slots = intern ("char-table-extra-slots");
986 DEFSYM (Qchar_script_table, "char-script-table");
987 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
988 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
989
990 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
991 doc: /* Alist of scripts vs the representative characters. */);
992 Vscript_representative_chars = Qnil;
993 }
994
995 #endif /* emacs */
996
997 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
998 (do not change this comment) */