]> code.delx.au - gnu-emacs/blob - src/character.c
Merge from emacs--devo--0
[gnu-emacs] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2005 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 static Lisp_Object Qchar_script_table;
85
86 /* Mapping table from unibyte chars to multibyte chars. */
87 int unibyte_to_multibyte_table[256];
88
89 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
90 char. */
91 char unibyte_has_multibyte_table[256];
92
93 \f
94
95 /* Store multibyte form of character C at P. If C has modifier bits,
96 handle them appropriately. */
97
98 int
99 char_string (c, p)
100 int c;
101 unsigned char *p;
102 {
103 int bytes;
104
105 if (c & CHAR_MODIFIER_MASK)
106 {
107 /* As an non-ASCII character can't have modifier bits, we just
108 ignore the bits. */
109 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
110 {
111 /* For Meta, Shift, and Control modifiers, we need special care. */
112 if (c & CHAR_META)
113 {
114 /* Move the meta bit to the right place for a string. */
115 c = (c & ~CHAR_META) | 0x80;
116 }
117 if (c & CHAR_SHIFT)
118 {
119 /* Shift modifier is valid only with [A-Za-z]. */
120 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
121 c &= ~CHAR_SHIFT;
122 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
123 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
124 }
125 if (c & CHAR_CTL)
126 {
127 /* Simulate the code in lread.c. */
128 /* Allow `\C- ' and `\C-?'. */
129 if (c == (CHAR_CTL | ' '))
130 c = 0;
131 else if (c == (CHAR_CTL | '?'))
132 c = 127;
133 /* ASCII control chars are made from letters (both cases),
134 as well as the non-letters within 0100...0137. */
135 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
136 c &= (037 | (~0177 & ~CHAR_CTL));
137 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
138 c &= (037 | (~0177 & ~CHAR_CTL));
139 }
140 }
141
142 /* If C still has any modifier bits, just ignore it. */
143 c &= ~CHAR_MODIFIER_MASK;
144 }
145
146 MAYBE_UNIFY_CHAR (c);
147
148 if (c <= MAX_3_BYTE_CHAR)
149 {
150 bytes = CHAR_STRING (c, p);
151 }
152 else if (c <= MAX_4_BYTE_CHAR)
153 {
154 p[0] = (0xF0 | (c >> 18));
155 p[1] = (0x80 | ((c >> 12) & 0x3F));
156 p[2] = (0x80 | ((c >> 6) & 0x3F));
157 p[3] = (0x80 | (c & 0x3F));
158 bytes = 4;
159 }
160 else if (c <= MAX_5_BYTE_CHAR)
161 {
162 p[0] = 0xF8;
163 p[1] = (0x80 | ((c >> 18) & 0x0F));
164 p[2] = (0x80 | ((c >> 12) & 0x3F));
165 p[3] = (0x80 | ((c >> 6) & 0x3F));
166 p[4] = (0x80 | (c & 0x3F));
167 bytes = 5;
168 }
169 else
170 {
171 c = CHAR_TO_BYTE8 (c);
172 bytes = BYTE8_STRING (c, p);
173 }
174
175 return bytes;
176 }
177
178
179 /* Return a character whose multibyte form is at P. Set LEN is not
180 NULL, it must be a pointer to integer. In that case, set *LEN to
181 the byte length of the multibyte form. If ADVANCED is not NULL, is
182 must be a pointer to unsigned char. In that case, set *ADVANCED to
183 the ending address (i.e. the starting address of the next
184 character) of the multibyte form. */
185
186 int
187 string_char (p, advanced, len)
188 const unsigned char *p;
189 const unsigned char **advanced;
190 int *len;
191 {
192 int c;
193 const unsigned char *saved_p = p;
194
195 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
196 {
197 c = STRING_CHAR_ADVANCE (p);
198 }
199 else if (! (*p & 0x08))
200 {
201 c = ((((p)[0] & 0xF) << 18)
202 | (((p)[1] & 0x3F) << 12)
203 | (((p)[2] & 0x3F) << 6)
204 | ((p)[3] & 0x3F));
205 p += 4;
206 }
207 else
208 {
209 c = ((((p)[1] & 0x3F) << 18)
210 | (((p)[2] & 0x3F) << 12)
211 | (((p)[3] & 0x3F) << 6)
212 | ((p)[4] & 0x3F));
213 p += 5;
214 }
215
216 MAYBE_UNIFY_CHAR (c);
217
218 if (len)
219 *len = p - saved_p;
220 if (advanced)
221 *advanced = p;
222 return c;
223 }
224
225
226 /* Translate character C by translation table TABLE. If C is
227 negative, translate a character specified by CHARSET and CODE. If
228 no translation is found in TABLE, return the untranslated
229 character. If TABLE is a list, elements are char tables. In this
230 case, translace C by all tables. */
231
232 int
233 translate_char (table, c)
234 Lisp_Object table;
235 int c;
236 {
237 if (CHAR_TABLE_P (table))
238 {
239 Lisp_Object ch;
240
241 ch = CHAR_TABLE_REF (table, c);
242 if (CHARACTERP (ch))
243 c = XINT (ch);
244 }
245 else
246 {
247 for (; CONSP (table); table = XCDR (table))
248 c = translate_char (XCAR (table), c);
249 }
250 return c;
251 }
252
253 /* Convert the multibyte character C to unibyte 8-bit character based
254 on the current value of charset_unibyte. If dimension of
255 charset_unibyte is more than one, return (C & 0xFF).
256
257 The argument REV_TBL is now ignored. It will be removed in the
258 future. */
259
260 int
261 multibyte_char_to_unibyte (c, rev_tbl)
262 int c;
263 Lisp_Object rev_tbl;
264 {
265 struct charset *charset;
266 unsigned c1;
267
268 if (CHAR_BYTE8_P (c))
269 return CHAR_TO_BYTE8 (c);
270 charset = CHARSET_FROM_ID (charset_unibyte);
271 c1 = ENCODE_CHAR (charset, c);
272 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
273 }
274
275
276 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
277 doc: /* Return non-nil if OBJECT is a character. */)
278 (object, ignore)
279 Lisp_Object object, ignore;
280 {
281 return (CHARACTERP (object) ? Qt : Qnil);
282 }
283
284 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
285 doc: /* Return the character of the maximum code. */)
286 ()
287 {
288 return make_number (MAX_CHAR);
289 }
290
291 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
292 Sunibyte_char_to_multibyte, 1, 1, 0,
293 doc: /* Convert the unibyte character CH to multibyte character.
294 The multibyte character is a result of decoding CH by
295 the current unibyte charset (see `unibyte-charset'). */)
296 (ch)
297 Lisp_Object ch;
298 {
299 int c;
300 struct charset *charset;
301
302 CHECK_CHARACTER (ch);
303 c = XFASTINT (ch);
304 if (c >= 0400)
305 error ("Invalid unibyte character: %d", c);
306 charset = CHARSET_FROM_ID (charset_unibyte);
307 c = DECODE_CHAR (charset, c);
308 if (c < 0)
309 c = BYTE8_TO_CHAR (XFASTINT (ch));
310 return make_number (c);
311 }
312
313 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
314 Smultibyte_char_to_unibyte, 1, 1, 0,
315 doc: /* Convert the multibyte character CH to unibyte character.\n\
316 The unibyte character is a result of encoding CH by
317 the current primary charset (value of `charset-primary'). */)
318 (ch)
319 Lisp_Object ch;
320 {
321 int c;
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
325 c = CHAR_TO_BYTE8 (c);
326 return make_number (c);
327 }
328
329 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
330 doc: /* Return 1 regardless of the argument CHAR.
331 This is now an obsolete function. We keep it just for backward compatibility. */)
332 (ch)
333 Lisp_Object ch;
334 {
335 CHECK_CHARACTER (ch);
336 return make_number (1);
337 }
338
339 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
340 doc: /* Return width of CHAR when displayed in the current buffer.
341 The width is measured by how many columns it occupies on the screen.
342 Tab is taken to occupy `tab-width' columns. */)
343 (ch)
344 Lisp_Object ch;
345 {
346 Lisp_Object disp;
347 int c, width;
348 struct Lisp_Char_Table *dp = buffer_display_table ();
349
350 CHECK_CHARACTER (ch);
351 c = XINT (ch);
352
353 /* Get the way the display table would display it. */
354 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
355
356 if (VECTORP (disp))
357 width = ASIZE (disp);
358 else
359 width = CHAR_WIDTH (c);
360
361 return make_number (width);
362 }
363
364 /* Return width of string STR of length LEN when displayed in the
365 current buffer. The width is measured by how many columns it
366 occupies on the screen. If PRECISION > 0, return the width of
367 longest substring that doesn't exceed PRECISION, and set number of
368 characters and bytes of the substring in *NCHARS and *NBYTES
369 respectively. */
370
371 int
372 c_string_width (str, len, precision, nchars, nbytes)
373 const unsigned char *str;
374 int precision, *nchars, *nbytes;
375 {
376 int i = 0, i_byte = 0;
377 int width = 0;
378 struct Lisp_Char_Table *dp = buffer_display_table ();
379
380 while (i_byte < len)
381 {
382 int bytes, thiswidth;
383 Lisp_Object val;
384 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
385
386 if (dp)
387 {
388 val = DISP_CHAR_VECTOR (dp, c);
389 if (VECTORP (val))
390 thiswidth = XVECTOR (val)->size;
391 else
392 thiswidth = CHAR_WIDTH (c);
393 }
394 else
395 {
396 thiswidth = CHAR_WIDTH (c);
397 }
398
399 if (precision > 0
400 && (width + thiswidth > precision))
401 {
402 *nchars = i;
403 *nbytes = i_byte;
404 return width;
405 }
406 i++;
407 i_byte += bytes;
408 width += thiswidth;
409 }
410
411 if (precision > 0)
412 {
413 *nchars = i;
414 *nbytes = i_byte;
415 }
416
417 return width;
418 }
419
420 /* Return width of string STR of length LEN when displayed in the
421 current buffer. The width is measured by how many columns it
422 occupies on the screen. */
423
424 int
425 strwidth (str, len)
426 unsigned char *str;
427 int len;
428 {
429 return c_string_width (str, len, -1, NULL, NULL);
430 }
431
432 /* Return width of Lisp string STRING when displayed in the current
433 buffer. The width is measured by how many columns it occupies on
434 the screen while paying attention to compositions. If PRECISION >
435 0, return the width of longest substring that doesn't exceed
436 PRECISION, and set number of characters and bytes of the substring
437 in *NCHARS and *NBYTES respectively. */
438
439 int
440 lisp_string_width (string, precision, nchars, nbytes)
441 Lisp_Object string;
442 int precision, *nchars, *nbytes;
443 {
444 int len = SCHARS (string);
445 /* This set multibyte to 0 even if STRING is multibyte when it
446 contains only ascii and eight-bit-graphic, but that's
447 intentional. */
448 int multibyte = len < SBYTES (string);
449 unsigned char *str = SDATA (string);
450 int i = 0, i_byte = 0;
451 int width = 0;
452 struct Lisp_Char_Table *dp = buffer_display_table ();
453
454 while (i < len)
455 {
456 int chars, bytes, thiswidth;
457 Lisp_Object val;
458 int cmp_id;
459 EMACS_INT ignore, end;
460
461 if (find_composition (i, -1, &ignore, &end, &val, string)
462 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
463 >= 0))
464 {
465 thiswidth = composition_table[cmp_id]->width;
466 chars = end - i;
467 bytes = string_char_to_byte (string, end) - i_byte;
468 }
469 else
470 {
471 int c;
472
473 if (multibyte)
474 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
475 else
476 c = str[i_byte], bytes = 1;
477 chars = 1;
478 if (dp)
479 {
480 val = DISP_CHAR_VECTOR (dp, c);
481 if (VECTORP (val))
482 thiswidth = XVECTOR (val)->size;
483 else
484 thiswidth = CHAR_WIDTH (c);
485 }
486 else
487 {
488 thiswidth = CHAR_WIDTH (c);
489 }
490 }
491
492 if (precision > 0
493 && (width + thiswidth > precision))
494 {
495 *nchars = i;
496 *nbytes = i_byte;
497 return width;
498 }
499 i += chars;
500 i_byte += bytes;
501 width += thiswidth;
502 }
503
504 if (precision > 0)
505 {
506 *nchars = i;
507 *nbytes = i_byte;
508 }
509
510 return width;
511 }
512
513 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
514 doc: /* Return width of STRING when displayed in the current buffer.
515 Width is measured by how many columns it occupies on the screen.
516 When calculating width of a multibyte character in STRING,
517 only the base leading-code is considered; the validity of
518 the following bytes is not checked. Tabs in STRING are always
519 taken to occupy `tab-width' columns. */)
520 (str)
521 Lisp_Object str;
522 {
523 Lisp_Object val;
524
525 CHECK_STRING (str);
526 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
527 return val;
528 }
529
530 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
531 doc: /* Return the direction of CHAR.
532 The returned value is 0 for left-to-right and 1 for right-to-left. */)
533 (ch)
534 Lisp_Object ch;
535 {
536 int c;
537
538 CHECK_CHARACTER (ch);
539 c = XINT (ch);
540 return CHAR_TABLE_REF (Vchar_direction_table, c);
541 }
542
543 /* Return the number of characters in the NBYTES bytes at PTR.
544 This works by looking at the contents and checking for multibyte
545 sequences while assuming that there's no invalid sequence.
546 However, if the current buffer has enable-multibyte-characters =
547 nil, we treat each byte as a character. */
548
549 int
550 chars_in_text (ptr, nbytes)
551 const unsigned char *ptr;
552 int nbytes;
553 {
554 /* current_buffer is null at early stages of Emacs initialization. */
555 if (current_buffer == 0
556 || NILP (current_buffer->enable_multibyte_characters))
557 return nbytes;
558
559 return multibyte_chars_in_text (ptr, nbytes);
560 }
561
562 /* Return the number of characters in the NBYTES bytes at PTR.
563 This works by looking at the contents and checking for multibyte
564 sequences while assuming that there's no invalid sequence. It
565 ignores enable-multibyte-characters. */
566
567 int
568 multibyte_chars_in_text (ptr, nbytes)
569 const unsigned char *ptr;
570 int nbytes;
571 {
572 const unsigned char *endp = ptr + nbytes;
573 int chars = 0;
574
575 while (ptr < endp)
576 {
577 int len = MULTIBYTE_LENGTH (ptr, endp);
578
579 if (len == 0)
580 abort ();
581 ptr += len;
582 chars++;
583 }
584
585 return chars;
586 }
587
588 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
589 characters and bytes in it, and store them in *NCHARS and *NBYTES
590 respectively. On counting bytes, pay attention to that 8-bit
591 characters not constructing a valid multibyte sequence are
592 represented by 2-byte in a multibyte text. */
593
594 void
595 parse_str_as_multibyte (str, len, nchars, nbytes)
596 const unsigned char *str;
597 int len, *nchars, *nbytes;
598 {
599 const unsigned char *endp = str + len;
600 int n, chars = 0, bytes = 0;
601
602 if (len >= MAX_MULTIBYTE_LENGTH)
603 {
604 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
605 while (str < adjusted_endp)
606 {
607 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
608 str += n, bytes += n;
609 else
610 str++, bytes += 2;
611 chars++;
612 }
613 }
614 while (str < endp)
615 {
616 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
617 str += n, bytes += n;
618 else
619 str++, bytes += 2;
620 chars++;
621 }
622
623 *nchars = chars;
624 *nbytes = bytes;
625 return;
626 }
627
628 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
629 It actually converts only such 8-bit characters that don't contruct
630 a multibyte sequence to multibyte forms of Latin-1 characters. If
631 NCHARS is nonzero, set *NCHARS to the number of characters in the
632 text. It is assured that we can use LEN bytes at STR as a work
633 area and that is enough. Return the number of bytes of the
634 resulting text. */
635
636 int
637 str_as_multibyte (str, len, nbytes, nchars)
638 unsigned char *str;
639 int len, nbytes, *nchars;
640 {
641 unsigned char *p = str, *endp = str + nbytes;
642 unsigned char *to;
643 int chars = 0;
644 int n;
645
646 if (nbytes >= MAX_MULTIBYTE_LENGTH)
647 {
648 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
649 while (p < adjusted_endp
650 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
651 p += n, chars++;
652 }
653 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
654 p += n, chars++;
655 if (nchars)
656 *nchars = chars;
657 if (p == endp)
658 return nbytes;
659
660 to = p;
661 nbytes = endp - p;
662 endp = str + len;
663 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
664 p = endp - nbytes;
665
666 if (nbytes >= MAX_MULTIBYTE_LENGTH)
667 {
668 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
669 while (p < adjusted_endp)
670 {
671 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
672 {
673 while (n--)
674 *to++ = *p++;
675 }
676 else
677 {
678 int c = *p++;
679 c = BYTE8_TO_CHAR (c);
680 to += CHAR_STRING (c, to);
681 }
682 }
683 chars++;
684 }
685 while (p < endp)
686 {
687 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
688 {
689 while (n--)
690 *to++ = *p++;
691 }
692 else
693 {
694 int c = *p++;
695 c = BYTE8_TO_CHAR (c);
696 to += CHAR_STRING (c, to);
697 }
698 chars++;
699 }
700 if (nchars)
701 *nchars = chars;
702 return (to - str);
703 }
704
705 /* Parse unibyte string at STR of LEN bytes, and return the number of
706 bytes it may ocupy when converted to multibyte string by
707 `str_to_multibyte'. */
708
709 int
710 parse_str_to_multibyte (str, len)
711 unsigned char *str;
712 int len;
713 {
714 unsigned char *endp = str + len;
715 int bytes;
716
717 for (bytes = 0; str < endp; str++)
718 bytes += (*str < 0x80) ? 1 : 2;
719 return bytes;
720 }
721
722
723 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
724 that contains the same single-byte characters. It actually
725 converts all 8-bit characters to multibyte forms. It is assured
726 that we can use LEN bytes at STR as a work area and that is
727 enough. */
728
729 int
730 str_to_multibyte (str, len, bytes)
731 unsigned char *str;
732 int len, bytes;
733 {
734 unsigned char *p = str, *endp = str + bytes;
735 unsigned char *to;
736
737 while (p < endp && *p < 0x80) p++;
738 if (p == endp)
739 return bytes;
740 to = p;
741 bytes = endp - p;
742 endp = str + len;
743 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
744 p = endp - bytes;
745 while (p < endp)
746 {
747 int c = *p++;
748
749 if (c >= 0x80)
750 c = BYTE8_TO_CHAR (c);
751 to += CHAR_STRING (c, to);
752 }
753 return (to - str);
754 }
755
756 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
757 actually converts characters in the range 0x80..0xFF to
758 unibyte. */
759
760 int
761 str_as_unibyte (str, bytes)
762 unsigned char *str;
763 int bytes;
764 {
765 const unsigned char *p = str, *endp = str + bytes;
766 unsigned char *to;
767 int c, len;
768
769 while (p < endp)
770 {
771 c = *p;
772 len = BYTES_BY_CHAR_HEAD (c);
773 if (CHAR_BYTE8_HEAD_P (c))
774 break;
775 p += len;
776 }
777 to = str + (p - str);
778 while (p < endp)
779 {
780 c = *p;
781 len = BYTES_BY_CHAR_HEAD (c);
782 if (CHAR_BYTE8_HEAD_P (c))
783 {
784 c = STRING_CHAR_ADVANCE (p);
785 *to++ = CHAR_TO_BYTE8 (c);
786 }
787 else
788 {
789 while (len--) *to++ = *p++;
790 }
791 }
792 return (to - str);
793 }
794
795 int
796 string_count_byte8 (string)
797 Lisp_Object string;
798 {
799 int multibyte = STRING_MULTIBYTE (string);
800 int nbytes = SBYTES (string);
801 unsigned char *p = SDATA (string);
802 unsigned char *pend = p + nbytes;
803 int count = 0;
804 int c, len;
805
806 if (multibyte)
807 while (p < pend)
808 {
809 c = *p;
810 len = BYTES_BY_CHAR_HEAD (c);
811
812 if (CHAR_BYTE8_HEAD_P (c))
813 count++;
814 p += len;
815 }
816 else
817 while (p < pend)
818 {
819 if (*p++ >= 0x80)
820 count++;
821 }
822 return count;
823 }
824
825
826 Lisp_Object
827 string_escape_byte8 (string)
828 Lisp_Object string;
829 {
830 int nchars = SCHARS (string);
831 int nbytes = SBYTES (string);
832 int multibyte = STRING_MULTIBYTE (string);
833 int byte8_count;
834 const unsigned char *src, *src_end;
835 unsigned char *dst;
836 Lisp_Object val;
837 int c, len;
838
839 if (multibyte && nchars == nbytes)
840 return string;
841
842 byte8_count = string_count_byte8 (string);
843
844 if (byte8_count == 0)
845 return string;
846
847 if (multibyte)
848 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
849 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
850 nbytes + byte8_count * 2);
851 else
852 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
853 val = make_uninit_string (nbytes + byte8_count * 3);
854
855 src = SDATA (string);
856 src_end = src + nbytes;
857 dst = SDATA (val);
858 if (multibyte)
859 while (src < src_end)
860 {
861 c = *src;
862 len = BYTES_BY_CHAR_HEAD (c);
863
864 if (CHAR_BYTE8_HEAD_P (c))
865 {
866 c = STRING_CHAR_ADVANCE (src);
867 c = CHAR_TO_BYTE8 (c);
868 sprintf ((char *) dst, "\\%03o", c);
869 dst += 4;
870 }
871 else
872 while (len--) *dst++ = *src++;
873 }
874 else
875 while (src < src_end)
876 {
877 c = *src++;
878 if (c >= 0x80)
879 {
880 sprintf ((char *) dst, "\\%03o", c);
881 dst += 4;
882 }
883 else
884 *dst++ = c;
885 }
886 return val;
887 }
888
889 \f
890 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
891 doc: /*
892 Concatenate all the argument characters and make the result a string.
893 usage: (string &rest CHARACTERS) */)
894 (n, args)
895 int n;
896 Lisp_Object *args;
897 {
898 int i;
899 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
900 unsigned char *p = buf;
901 int c;
902
903 for (i = 0; i < n; i++)
904 {
905 CHECK_CHARACTER (args[i]);
906 c = XINT (args[i]);
907 p += CHAR_STRING (c, p);
908 }
909
910 return make_string_from_bytes ((char *) buf, n, p - buf);
911 }
912
913 void
914 init_character_once ()
915 {
916 }
917
918 #ifdef emacs
919
920 void
921 syms_of_character ()
922 {
923 DEFSYM (Qcharacterp, "characterp");
924 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
925
926 staticpro (&Vchar_unify_table);
927 Vchar_unify_table = Qnil;
928
929 defsubr (&Smax_char);
930 defsubr (&Scharacterp);
931 defsubr (&Sunibyte_char_to_multibyte);
932 defsubr (&Smultibyte_char_to_unibyte);
933 defsubr (&Schar_bytes);
934 defsubr (&Schar_width);
935 defsubr (&Sstring_width);
936 defsubr (&Schar_direction);
937 defsubr (&Sstring);
938
939 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
940 doc: /*
941 Vector recording all translation tables ever defined.
942 Each element is a pair (SYMBOL . TABLE) relating the table to the
943 symbol naming it. The ID of a translation table is an index into this vector. */);
944 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
945
946 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
947 doc: /*
948 A char-table for characters which invoke auto-filling.
949 Such characters have value t in this table. */);
950 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
951 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
952 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
953
954 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
955 doc: /*
956 A char-table for width (columns) of each character. */);
957 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
958 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
959 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
960 make_number (4));
961
962 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
963 doc: /* A char-table for direction of each character. */);
964 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
965
966 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
967 doc: /* A char-table for each printable character. */);
968 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
969 Fset_char_table_range (Vprintable_chars,
970 Fcons (make_number (32), make_number (126)), Qt);
971 Fset_char_table_range (Vprintable_chars,
972 Fcons (make_number (160),
973 make_number (MAX_5_BYTE_CHAR)), Qt);
974
975 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
976 doc: /* Char table of script symbols.
977 It has one extra slot whose value is a list of script symbols. */);
978
979 /* Intern this now in case it isn't already done.
980 Setting this variable twice is harmless.
981 But don't staticpro it here--that is done in alloc.c. */
982 Qchar_table_extra_slots = intern ("char-table-extra-slots");
983 DEFSYM (Qchar_script_table, "char-script-table");
984 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
985 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
986 }
987
988 #endif /* emacs */
989
990 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
991 (do not change this comment) */