]> code.delx.au - gnu-emacs/blob - src/character.c
Revision: miles@gnu.org--gnu-2005/emacs--unicode--0--patch-69
[gnu-emacs] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2005 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 static Lisp_Object Qchar_script_table;
85
86 /* Mapping table from unibyte chars to multibyte chars. */
87 int unibyte_to_multibyte_table[256];
88
89 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
90 char. */
91 char unibyte_has_multibyte_table[256];
92
93 \f
94
95 /* Store multibyte form of character C at P. If C has modifier bits,
96 handle them appropriately. */
97
98 int
99 char_string (c, p)
100 int c;
101 unsigned char *p;
102 {
103 int bytes;
104
105 if (c & CHAR_MODIFIER_MASK)
106 {
107 /* As an non-ASCII character can't have modifier bits, we just
108 ignore the bits. */
109 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
110 {
111 /* For Meta, Shift, and Control modifiers, we need special care. */
112 if (c & CHAR_META)
113 {
114 /* Move the meta bit to the right place for a string. */
115 c = (c & ~CHAR_META) | 0x80;
116 }
117 if (c & CHAR_SHIFT)
118 {
119 /* Shift modifier is valid only with [A-Za-z]. */
120 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
121 c &= ~CHAR_SHIFT;
122 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
123 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
124 }
125 if (c & CHAR_CTL)
126 {
127 /* Simulate the code in lread.c. */
128 /* Allow `\C- ' and `\C-?'. */
129 if (c == (CHAR_CTL | ' '))
130 c = 0;
131 else if (c == (CHAR_CTL | '?'))
132 c = 127;
133 /* ASCII control chars are made from letters (both cases),
134 as well as the non-letters within 0100...0137. */
135 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
136 c &= (037 | (~0177 & ~CHAR_CTL));
137 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
138 c &= (037 | (~0177 & ~CHAR_CTL));
139 }
140 }
141
142 /* If C still has any modifier bits, just ignore it. */
143 c &= ~CHAR_MODIFIER_MASK;
144 }
145
146 MAYBE_UNIFY_CHAR (c);
147
148 if (c <= MAX_3_BYTE_CHAR)
149 {
150 bytes = CHAR_STRING (c, p);
151 }
152 else if (c <= MAX_4_BYTE_CHAR)
153 {
154 p[0] = (0xF0 | (c >> 18));
155 p[1] = (0x80 | ((c >> 12) & 0x3F));
156 p[2] = (0x80 | ((c >> 6) & 0x3F));
157 p[3] = (0x80 | (c & 0x3F));
158 bytes = 4;
159 }
160 else if (c <= MAX_5_BYTE_CHAR)
161 {
162 p[0] = 0xF8;
163 p[1] = (0x80 | ((c >> 18) & 0x0F));
164 p[2] = (0x80 | ((c >> 12) & 0x3F));
165 p[3] = (0x80 | ((c >> 6) & 0x3F));
166 p[4] = (0x80 | (c & 0x3F));
167 bytes = 5;
168 }
169 else
170 {
171 c = CHAR_TO_BYTE8 (c);
172 bytes = BYTE8_STRING (c, p);
173 }
174
175 return bytes;
176 }
177
178
/* Return the character whose multibyte form is at P.  If LEN is not
   NULL, it must be a pointer to integer; *LEN is then set to the
   byte length of the multibyte form.  If ADVANCED is not NULL, it
   must be a pointer to a pointer to unsigned char; *ADVANCED is then
   set to the ending address of the multibyte form (i.e. the starting
   address of the next character).  */

int
string_char (p, advanced, len)
     const unsigned char *p;
     const unsigned char **advanced;
     int *len;
{
  const unsigned char *start = p;
  int c;

  if (*p >= 0x80 && (*p & 0x20) && (*p & 0x10))
    {
      if (*p & 0x08)
        {
          /* 5-byte form: the payload starts in the second byte.  */
          c = ((p[1] & 0x3F) << 18)
              | ((p[2] & 0x3F) << 12)
              | ((p[3] & 0x3F) << 6)
              | (p[4] & 0x3F);
          p += 5;
        }
      else
        {
          /* 4-byte form.  */
          c = ((p[0] & 0xF) << 18)
              | ((p[1] & 0x3F) << 12)
              | ((p[2] & 0x3F) << 6)
              | (p[3] & 0x3F);
          p += 4;
        }
    }
  else
    {
      /* Every other leading byte is left to the generic macro.  */
      c = STRING_CHAR_ADVANCE (p);
    }

  MAYBE_UNIFY_CHAR (c);

  if (len)
    *len = p - start;
  if (advanced)
    *advanced = p;
  return c;
}
224
225
226 /* Translate character C by translation table TABLE. If C is
227 negative, translate a character specified by CHARSET and CODE. If
228 no translation is found in TABLE, return the untranslated
229 character. If TABLE is a list, elements are char tables. In this
230 case, translace C by all tables. */
231
232 int
233 translate_char (table, c)
234 Lisp_Object table;
235 int c;
236 {
237 if (CHAR_TABLE_P (table))
238 {
239 Lisp_Object ch;
240
241 ch = CHAR_TABLE_REF (table, c);
242 if (CHARACTERP (ch))
243 c = XINT (ch);
244 }
245 else
246 {
247 for (; CONSP (table); table = XCDR (table))
248 c = translate_char (XCAR (table), c);
249 }
250 return c;
251 }
252
253 /* Convert the multibyte character C to unibyte 8-bit character based
254 on the current value of charset_unibyte. If dimension of
255 charset_unibyte is more than one, return (C & 0xFF).
256
257 The argument REV_TBL is now ignored. It will be removed in the
258 future. */
259
260 int
261 multibyte_char_to_unibyte (c, rev_tbl)
262 int c;
263 Lisp_Object rev_tbl;
264 {
265 struct charset *charset;
266 unsigned c1;
267
268 if (CHAR_BYTE8_P (c))
269 return CHAR_TO_BYTE8 (c);
270 charset = CHARSET_FROM_ID (charset_unibyte);
271 c1 = ENCODE_CHAR (charset, c);
272 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
273 }
274
275
276 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
277 doc: /* Return non-nil if OBJECT is a character. */)
278 (object, ignore)
279 Lisp_Object object, ignore;
280 {
281 return (CHARACTERP (object) ? Qt : Qnil);
282 }
283
284 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
285 doc: /* Return the character of the maximum code. */)
286 ()
287 {
288 return make_number (MAX_CHAR);
289 }
290
291 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
292 Sunibyte_char_to_multibyte, 1, 1, 0,
293 doc: /* Convert the unibyte character CH to multibyte character.
294 The multibyte character is a result of decoding CH by
295 the current unibyte charset (see `unibyte-charset'). */)
296 (ch)
297 Lisp_Object ch;
298 {
299 int c;
300 struct charset *charset;
301
302 CHECK_CHARACTER (ch);
303 c = XFASTINT (ch);
304 if (c >= 0400)
305 error ("Invalid unibyte character: %d", c);
306 charset = CHARSET_FROM_ID (charset_unibyte);
307 c = DECODE_CHAR (charset, c);
308 if (c < 0)
309 c = BYTE8_TO_CHAR (XFASTINT (ch));
310 return make_number (c);
311 }
312
313 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
314 Smultibyte_char_to_unibyte, 1, 1, 0,
315 doc: /* Convert the multibyte character CH to unibyte character.\n\
316 The unibyte character is a result of encoding CH by
317 the current primary charset (value of `charset-primary'). */)
318 (ch)
319 Lisp_Object ch;
320 {
321 int c;
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
325 c = CHAR_TO_BYTE8 (c);
326 return make_number (c);
327 }
328
329 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
330 doc: /* Return 1 regardless of the argument CHAR.
331 This is now an obsolete function. We keep it just for backward compatibility. */)
332 (ch)
333 Lisp_Object ch;
334 {
335 CHECK_CHARACTER (ch);
336 return make_number (1);
337 }
338
339 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
340 doc: /* Return width of CHAR when displayed in the current buffer.
341 The width is measured by how many columns it occupies on the screen.
342 Tab is taken to occupy `tab-width' columns. */)
343 (ch)
344 Lisp_Object ch;
345 {
346 Lisp_Object disp;
347 int c, width;
348 struct Lisp_Char_Table *dp = buffer_display_table ();
349
350 CHECK_CHARACTER (ch);
351 c = XINT (ch);
352
353 /* Get the way the display table would display it. */
354 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
355
356 if (VECTORP (disp))
357 width = ASIZE (disp);
358 else
359 width = CHAR_WIDTH (c);
360
361 return make_number (width);
362 }
363
364 /* Return width of string STR of length LEN when displayed in the
365 current buffer. The width is measured by how many columns it
366 occupies on the screen. If PRECISION > 0, return the width of
367 longest substring that doesn't exceed PRECISION, and set number of
368 characters and bytes of the substring in *NCHARS and *NBYTES
369 respectively. */
370
371 int
372 c_string_width (str, len, precision, nchars, nbytes)
373 const unsigned char *str;
374 int precision, *nchars, *nbytes;
375 {
376 int i = 0, i_byte = 0;
377 int width = 0;
378 struct Lisp_Char_Table *dp = buffer_display_table ();
379
380 while (i_byte < len)
381 {
382 int bytes, thiswidth;
383 Lisp_Object val;
384 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
385
386 if (dp)
387 {
388 val = DISP_CHAR_VECTOR (dp, c);
389 if (VECTORP (val))
390 thiswidth = XVECTOR (val)->size;
391 else
392 thiswidth = CHAR_WIDTH (c);
393 }
394 else
395 {
396 thiswidth = CHAR_WIDTH (c);
397 }
398
399 if (precision > 0
400 && (width + thiswidth > precision))
401 {
402 *nchars = i;
403 *nbytes = i_byte;
404 return width;
405 }
406 i++;
407 i_byte += bytes;
408 width += thiswidth;
409 }
410
411 if (precision > 0)
412 {
413 *nchars = i;
414 *nbytes = i_byte;
415 }
416
417 return width;
418 }
419
/* Return the width of string STR of length LEN when displayed in the
   current buffer, measured in screen columns.  This is
   c_string_width with no precision limit.  */

int
strwidth (str, len)
     unsigned char *str;
     int len;
{
  int columns;

  columns = c_string_width (str, len, -1, NULL, NULL);
  return columns;
}
431
432 /* Return width of Lisp string STRING when displayed in the current
433 buffer. The width is measured by how many columns it occupies on
434 the screen while paying attention to compositions. If PRECISION >
435 0, return the width of longest substring that doesn't exceed
436 PRECISION, and set number of characters and bytes of the substring
437 in *NCHARS and *NBYTES respectively. */
438
439 int
440 lisp_string_width (string, precision, nchars, nbytes)
441 Lisp_Object string;
442 int precision, *nchars, *nbytes;
443 {
444 int len = SCHARS (string);
445 unsigned char *str = SDATA (string);
446 int i = 0, i_byte = 0;
447 int width = 0;
448 struct Lisp_Char_Table *dp = buffer_display_table ();
449
450 while (i < len)
451 {
452 int chars, bytes, thiswidth;
453 Lisp_Object val;
454 int cmp_id;
455 EMACS_INT ignore, end;
456
457 if (find_composition (i, -1, &ignore, &end, &val, string)
458 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
459 >= 0))
460 {
461 thiswidth = composition_table[cmp_id]->width;
462 chars = end - i;
463 bytes = string_char_to_byte (string, end) - i_byte;
464 }
465 else if (dp)
466 {
467 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
468
469 chars = 1;
470 val = DISP_CHAR_VECTOR (dp, c);
471 if (VECTORP (val))
472 thiswidth = XVECTOR (val)->size;
473 else
474 thiswidth = CHAR_WIDTH (c);
475 }
476 else
477 {
478 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
479
480 chars = 1;
481 thiswidth = CHAR_WIDTH (c);
482 }
483
484 if (precision > 0
485 && (width + thiswidth > precision))
486 {
487 *nchars = i;
488 *nbytes = i_byte;
489 return width;
490 }
491 i += chars;
492 i_byte += bytes;
493 width += thiswidth;
494 }
495
496 if (precision > 0)
497 {
498 *nchars = i;
499 *nbytes = i_byte;
500 }
501
502 return width;
503 }
504
505 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
506 doc: /* Return width of STRING when displayed in the current buffer.
507 Width is measured by how many columns it occupies on the screen.
508 When calculating width of a multibyte character in STRING,
509 only the base leading-code is considered; the validity of
510 the following bytes is not checked. Tabs in STRING are always
511 taken to occupy `tab-width' columns. */)
512 (str)
513 Lisp_Object str;
514 {
515 Lisp_Object val;
516
517 CHECK_STRING (str);
518 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
519 return val;
520 }
521
522 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
523 doc: /* Return the direction of CHAR.
524 The returned value is 0 for left-to-right and 1 for right-to-left. */)
525 (ch)
526 Lisp_Object ch;
527 {
528 int c;
529
530 CHECK_CHARACTER (ch);
531 c = XINT (ch);
532 return CHAR_TABLE_REF (Vchar_direction_table, c);
533 }
534
535 /* Return the number of characters in the NBYTES bytes at PTR.
536 This works by looking at the contents and checking for multibyte
537 sequences while assuming that there's no invalid sequence.
538 However, if the current buffer has enable-multibyte-characters =
539 nil, we treat each byte as a character. */
540
541 int
542 chars_in_text (ptr, nbytes)
543 const unsigned char *ptr;
544 int nbytes;
545 {
546 /* current_buffer is null at early stages of Emacs initialization. */
547 if (current_buffer == 0
548 || NILP (current_buffer->enable_multibyte_characters))
549 return nbytes;
550
551 return multibyte_chars_in_text (ptr, nbytes);
552 }
553
/* Return the number of characters in the NBYTES bytes at PTR.
   Multibyte sequences are recognized from the contents, on the
   assumption that none of them is invalid; if MULTIBYTE_LENGTH
   reports a zero length, abort.  enable-multibyte-characters is
   ignored.  */

int
multibyte_chars_in_text (ptr, nbytes)
     const unsigned char *ptr;
     int nbytes;
{
  const unsigned char *limit = ptr + nbytes;
  int count;

  for (count = 0; ptr < limit; count++)
    {
      int len = MULTIBYTE_LENGTH (ptr, limit);

      if (len == 0)
        abort ();
      ptr += len;
    }

  return count;
}
579
580 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
581 characters and bytes in it, and store them in *NCHARS and *NBYTES
582 respectively. On counting bytes, pay attention to that 8-bit
583 characters not constructing a valid multibyte sequence are
584 represented by 2-byte in a multibyte text. */
585
586 void
587 parse_str_as_multibyte (str, len, nchars, nbytes)
588 const unsigned char *str;
589 int len, *nchars, *nbytes;
590 {
591 const unsigned char *endp = str + len;
592 int n, chars = 0, bytes = 0;
593
594 if (len >= MAX_MULTIBYTE_LENGTH)
595 {
596 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
597 while (str < adjusted_endp)
598 {
599 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
600 str += n, bytes += n;
601 else
602 str++, bytes += 2;
603 chars++;
604 }
605 }
606 while (str < endp)
607 {
608 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
609 str += n, bytes += n;
610 else
611 str++, bytes += 2;
612 chars++;
613 }
614
615 *nchars = chars;
616 *nbytes = bytes;
617 return;
618 }
619
620 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
621 It actually converts only such 8-bit characters that don't contruct
622 a multibyte sequence to multibyte forms of Latin-1 characters. If
623 NCHARS is nonzero, set *NCHARS to the number of characters in the
624 text. It is assured that we can use LEN bytes at STR as a work
625 area and that is enough. Return the number of bytes of the
626 resulting text. */
627
628 int
629 str_as_multibyte (str, len, nbytes, nchars)
630 unsigned char *str;
631 int len, nbytes, *nchars;
632 {
633 unsigned char *p = str, *endp = str + nbytes;
634 unsigned char *to;
635 int chars = 0;
636 int n;
637
638 if (nbytes >= MAX_MULTIBYTE_LENGTH)
639 {
640 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
641 while (p < adjusted_endp
642 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
643 p += n, chars++;
644 }
645 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
646 p += n, chars++;
647 if (nchars)
648 *nchars = chars;
649 if (p == endp)
650 return nbytes;
651
652 to = p;
653 nbytes = endp - p;
654 endp = str + len;
655 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
656 p = endp - nbytes;
657
658 if (nbytes >= MAX_MULTIBYTE_LENGTH)
659 {
660 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
661 while (p < adjusted_endp)
662 {
663 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
664 {
665 while (n--)
666 *to++ = *p++;
667 }
668 else
669 {
670 int c = *p++;
671 c = BYTE8_TO_CHAR (c);
672 to += CHAR_STRING (c, to);
673 }
674 }
675 chars++;
676 }
677 while (p < endp)
678 {
679 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
680 {
681 while (n--)
682 *to++ = *p++;
683 }
684 else
685 {
686 int c = *p++;
687 c = BYTE8_TO_CHAR (c);
688 to += CHAR_STRING (c, to);
689 }
690 chars++;
691 }
692 if (nchars)
693 *nchars = chars;
694 return (to - str);
695 }
696
/* Parse the unibyte string at STR of LEN bytes, and return the
   number of bytes it may occupy when converted to a multibyte string
   by `str_to_multibyte': one byte for each byte below 0x80 and two
   bytes for every other byte.  */

int
parse_str_to_multibyte (str, len)
     unsigned char *str;
     int len;
{
  int total = 0;
  int i;

  for (i = 0; i < len; i++)
    {
      if (str[i] >= 0x80)
        total += 2;
      else
        total += 1;
    }
  return total;
}
713
714
/* Convert the unibyte text at STR of BYTES bytes to a multibyte text
   that contains the same single-byte characters, i.e. every byte of
   0x80 and above is replaced by the multibyte form of
   BYTE8_TO_CHAR of that byte.  The caller guarantees that LEN bytes
   at STR may be used as a work area and that this is enough.  Return
   the number of bytes of the resulting text.  */

int
str_to_multibyte (str, len, bytes)
     unsigned char *str;
     int len, bytes;
{
  unsigned char *src, *limit = str + bytes;
  unsigned char *to;

  /* The leading bytes below 0x80 stay where they are.  */
  for (src = str; src < limit && *src < 0x80; src++)
    ;
  if (src == limit)
    return bytes;

  /* Move the rest to the end of the work area and expand it from
     there back to where it started.  */
  to = src;
  bytes = limit - src;
  limit = str + len;
  safe_bcopy ((char *) src, (char *) (limit - bytes), bytes);

  for (src = limit - bytes; src < limit; )
    {
      int c = *src++;

      if (c >= 0x80)
        c = BYTE8_TO_CHAR (c);
      to += CHAR_STRING (c, to);
    }
  return (to - str);
}
747
/* Arrange the multibyte text at STR of BYTES bytes as a unibyte
   text, in place.  Sequences whose leading byte satisfies
   CHAR_BYTE8_HEAD_P are replaced by the single byte they stand for;
   all other sequences are copied unchanged.  Return the number of
   bytes of the resulting text.  */

int
str_as_unibyte (str, bytes)
     unsigned char *str;
     int bytes;
{
  const unsigned char *p = str, *endp = str + bytes;
  unsigned char *to;

  /* Nothing has to move before the first 8-bit sequence.  */
  while (p < endp && ! CHAR_BYTE8_HEAD_P (*p))
    p += BYTES_BY_CHAR_HEAD (*p);

  to = str + (p - str);
  while (p < endp)
    {
      int c = *p;

      if (CHAR_BYTE8_HEAD_P (c))
        {
          c = STRING_CHAR_ADVANCE (p);
          *to++ = CHAR_TO_BYTE8 (c);
        }
      else
        {
          int len = BYTES_BY_CHAR_HEAD (c);

          for (; len > 0; len--)
            *to++ = *p++;
        }
    }
  return (to - str);
}
786
787 int
788 string_count_byte8 (string)
789 Lisp_Object string;
790 {
791 int multibyte = STRING_MULTIBYTE (string);
792 int nbytes = SBYTES (string);
793 unsigned char *p = SDATA (string);
794 unsigned char *pend = p + nbytes;
795 int count = 0;
796 int c, len;
797
798 if (multibyte)
799 while (p < pend)
800 {
801 c = *p;
802 len = BYTES_BY_CHAR_HEAD (c);
803
804 if (CHAR_BYTE8_HEAD_P (c))
805 count++;
806 p += len;
807 }
808 else
809 while (p < pend)
810 {
811 if (*p++ >= 0x80)
812 count++;
813 }
814 return count;
815 }
816
817
818 Lisp_Object
819 string_escape_byte8 (string)
820 Lisp_Object string;
821 {
822 int nchars = SCHARS (string);
823 int nbytes = SBYTES (string);
824 int multibyte = STRING_MULTIBYTE (string);
825 int byte8_count;
826 const unsigned char *src, *src_end;
827 unsigned char *dst;
828 Lisp_Object val;
829 int c, len;
830
831 if (multibyte && nchars == nbytes)
832 return string;
833
834 byte8_count = string_count_byte8 (string);
835
836 if (byte8_count == 0)
837 return string;
838
839 if (multibyte)
840 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
841 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
842 nbytes + byte8_count * 2);
843 else
844 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
845 val = make_uninit_string (nbytes + byte8_count * 3);
846
847 src = SDATA (string);
848 src_end = src + nbytes;
849 dst = SDATA (val);
850 if (multibyte)
851 while (src < src_end)
852 {
853 c = *src;
854 len = BYTES_BY_CHAR_HEAD (c);
855
856 if (CHAR_BYTE8_HEAD_P (c))
857 {
858 c = STRING_CHAR_ADVANCE (src);
859 c = CHAR_TO_BYTE8 (c);
860 sprintf ((char *) dst, "\\%03o", c);
861 dst += 4;
862 }
863 else
864 while (len--) *dst++ = *src++;
865 }
866 else
867 while (src < src_end)
868 {
869 c = *src++;
870 if (c >= 0x80)
871 {
872 sprintf ((char *) dst, "\\%03o", c);
873 dst += 4;
874 }
875 else
876 *dst++ = c;
877 }
878 return val;
879 }
880
881 \f
882 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
883 doc: /*
884 Concatenate all the argument characters and make the result a string.
885 usage: (string &rest CHARACTERS) */)
886 (n, args)
887 int n;
888 Lisp_Object *args;
889 {
890 int i;
891 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
892 unsigned char *p = buf;
893 int c;
894
895 for (i = 0; i < n; i++)
896 {
897 CHECK_CHARACTER (args[i]);
898 c = XINT (args[i]);
899 p += CHAR_STRING (c, p);
900 }
901
902 return make_string_from_bytes ((char *) buf, n, p - buf);
903 }
904
/* One-time initialization hook for this file.  */
void
init_character_once ()
{
  /* Nothing to initialize.  */
}
909
910 #ifdef emacs
911
912 void
913 syms_of_character ()
914 {
915 DEFSYM (Qcharacterp, "characterp");
916 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
917
918 staticpro (&Vchar_unify_table);
919 Vchar_unify_table = Qnil;
920
921 defsubr (&Smax_char);
922 defsubr (&Scharacterp);
923 defsubr (&Sunibyte_char_to_multibyte);
924 defsubr (&Smultibyte_char_to_unibyte);
925 defsubr (&Schar_bytes);
926 defsubr (&Schar_width);
927 defsubr (&Sstring_width);
928 defsubr (&Schar_direction);
929 defsubr (&Sstring);
930
931 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
932 doc: /*
933 Vector recording all translation tables ever defined.
934 Each element is a pair (SYMBOL . TABLE) relating the table to the
935 symbol naming it. The ID of a translation table is an index into this vector. */);
936 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
937
938 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
939 doc: /*
940 A char-table for characters which invoke auto-filling.
941 Such characters have value t in this table. */);
942 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
943 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
944 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
945
946 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
947 doc: /*
948 A char-table for width (columns) of each character. */);
949 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
950 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
951 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
952 make_number (4));
953
954 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
955 doc: /* A char-table for direction of each character. */);
956 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
957
958 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
959 doc: /* A char-table for each printable character. */);
960 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
961 Fset_char_table_range (Vprintable_chars,
962 Fcons (make_number (32), make_number (126)), Qt);
963 Fset_char_table_range (Vprintable_chars,
964 Fcons (make_number (160),
965 make_number (MAX_5_BYTE_CHAR)), Qt);
966
967 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
968 doc: /* Char table of script symbols.
969 It has one extra slot whose value is a list of script symbols. */);
970
971 /* Intern this now in case it isn't already done.
972 Setting this variable twice is harmless.
973 But don't staticpro it here--that is done in alloc.c. */
974 Qchar_table_extra_slots = intern ("char-table-extra-slots");
975 DEFSYM (Qchar_script_table, "char-script-table");
976 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
977 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
978 }
979
980 #endif /* emacs */
981
982 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
983 (do not change this comment) */