]> code.delx.au - gnu-emacs/blob - src/character.c
Revision: miles@gnu.org--gnu-2004/emacs--unicode--0--patch-17
[gnu-emacs] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 static Lisp_Object Qchar_script_table;
85
86 /* Mapping table from unibyte chars to multibyte chars. */
87 int unibyte_to_multibyte_table[256];
88
89 \f
90
91 /* Store multibyte form of character C at P. If C has modifier bits,
92 handle them appropriately. */
93
94 int
95 char_string (c, p)
96 int c;
97 unsigned char *p;
98 {
99 int bytes;
100
101 if (c & CHAR_MODIFIER_MASK)
102 {
103 /* As an non-ASCII character can't have modifier bits, we just
104 ignore the bits. */
105 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
106 {
107 /* For Meta, Shift, and Control modifiers, we need special care. */
108 if (c & CHAR_META)
109 {
110 /* Move the meta bit to the right place for a string. */
111 c = (c & ~CHAR_META) | 0x80;
112 }
113 if (c & CHAR_SHIFT)
114 {
115 /* Shift modifier is valid only with [A-Za-z]. */
116 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
117 c &= ~CHAR_SHIFT;
118 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
119 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
120 }
121 if (c & CHAR_CTL)
122 {
123 /* Simulate the code in lread.c. */
124 /* Allow `\C- ' and `\C-?'. */
125 if (c == (CHAR_CTL | ' '))
126 c = 0;
127 else if (c == (CHAR_CTL | '?'))
128 c = 127;
129 /* ASCII control chars are made from letters (both cases),
130 as well as the non-letters within 0100...0137. */
131 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
132 c &= (037 | (~0177 & ~CHAR_CTL));
133 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
134 c &= (037 | (~0177 & ~CHAR_CTL));
135 }
136 }
137
138 /* If C still has any modifier bits, just ignore it. */
139 c &= ~CHAR_MODIFIER_MASK;
140 }
141
142 MAYBE_UNIFY_CHAR (c);
143
144 if (c <= MAX_3_BYTE_CHAR)
145 {
146 bytes = CHAR_STRING (c, p);
147 }
148 else if (c <= MAX_4_BYTE_CHAR)
149 {
150 p[0] = (0xF0 | (c >> 18));
151 p[1] = (0x80 | ((c >> 12) & 0x3F));
152 p[2] = (0x80 | ((c >> 6) & 0x3F));
153 p[3] = (0x80 | (c & 0x3F));
154 bytes = 4;
155 }
156 else if (c <= MAX_5_BYTE_CHAR)
157 {
158 p[0] = 0xF8;
159 p[1] = (0x80 | ((c >> 18) & 0x0F));
160 p[2] = (0x80 | ((c >> 12) & 0x3F));
161 p[3] = (0x80 | ((c >> 6) & 0x3F));
162 p[4] = (0x80 | (c & 0x3F));
163 bytes = 5;
164 }
165 else
166 {
167 c = CHAR_TO_BYTE8 (c);
168 bytes = BYTE8_STRING (c, p);
169 }
170
171 return bytes;
172 }
173
174
/* Return the character whose multibyte form starts at P.  If LEN is
   not NULL, it must point to an int, which is set to the byte length
   of the multibyte form.  If ADVANCED is not NULL, it must point to
   a `const unsigned char *', which is set to the ending address of
   the multibyte form (i.e. the starting address of the next
   character).  */

int
string_char (p, advanced, len)
     const unsigned char *p;
     const unsigned char **advanced;
     int *len;
{
  const unsigned char *start = p;
  int c;

  if (*p >= 0x80 && (*p & 0x20) && (*p & 0x10))
    {
      /* Lead byte has both the 0x20 and 0x10 bits set: the form is 4
         or 5 bytes long, and the 0x08 bit tells which.  */
      if (*p & 0x08)
        {
          c = ((((p)[1] & 0x3F) << 18)
               | (((p)[2] & 0x3F) << 12)
               | (((p)[3] & 0x3F) << 6)
               | ((p)[4] & 0x3F));
          p += 5;
        }
      else
        {
          c = ((((p)[0] & 0xF) << 18)
               | (((p)[1] & 0x3F) << 12)
               | (((p)[2] & 0x3F) << 6)
               | ((p)[3] & 0x3F));
          p += 4;
        }
    }
  else
    {
      /* Shorter forms are decoded by the macro, which advances P.  */
      c = STRING_CHAR_ADVANCE (p);
    }

  MAYBE_UNIFY_CHAR (c);

  if (len)
    *len = p - start;
  if (advanced)
    *advanced = p;
  return c;
}
220
221
222 /* Translate character C by translation table TABLE. If C is
223 negative, translate a character specified by CHARSET and CODE. If
224 no translation is found in TABLE, return the untranslated
225 character. If TABLE is a list, elements are char tables. In this
226 case, translace C by all tables. */
227
228 int
229 translate_char (table, c)
230 Lisp_Object table;
231 int c;
232 {
233 if (CHAR_TABLE_P (table))
234 {
235 Lisp_Object ch;
236
237 ch = CHAR_TABLE_REF (table, c);
238 if (CHARACTERP (ch))
239 c = XINT (ch);
240 }
241 else
242 {
243 for (; CONSP (table); table = XCDR (table))
244 c = translate_char (XCAR (table), c);
245 }
246 return c;
247 }
248
249 /* Convert the multibyte character C to unibyte 8-bit character based
250 on the current value of charset_unibyte. If dimension of
251 charset_unibyte is more than one, return (C & 0xFF).
252
253 The argument REV_TBL is now ignored. It will be removed in the
254 future. */
255
256 int
257 multibyte_char_to_unibyte (c, rev_tbl)
258 int c;
259 Lisp_Object rev_tbl;
260 {
261 struct charset *charset;
262 unsigned c1;
263
264 if (CHAR_BYTE8_P (c))
265 return CHAR_TO_BYTE8 (c);
266 charset = CHARSET_FROM_ID (charset_unibyte);
267 c1 = ENCODE_CHAR (charset, c);
268 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
269 }
270
271
272 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
273 doc: /* Return non-nil if OBJECT is a character. */)
274 (object, ignore)
275 Lisp_Object object, ignore;
276 {
277 return (CHARACTERP (object) ? Qt : Qnil);
278 }
279
280 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
281 doc: /* Return the character of the maximum code. */)
282 ()
283 {
284 return make_number (MAX_CHAR);
285 }
286
287 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
288 Sunibyte_char_to_multibyte, 1, 1, 0,
289 doc: /* Convert the unibyte character CH to multibyte character.
290 The multibyte character is a result of decoding CH by
291 the current unibyte charset (see `unibyte-charset'). */)
292 (ch)
293 Lisp_Object ch;
294 {
295 int c;
296 struct charset *charset;
297
298 CHECK_CHARACTER (ch);
299 c = XFASTINT (ch);
300 if (c >= 0400)
301 error ("Invalid unibyte character: %d", c);
302 charset = CHARSET_FROM_ID (charset_unibyte);
303 c = DECODE_CHAR (charset, c);
304 if (c < 0)
305 c = BYTE8_TO_CHAR (XFASTINT (ch));
306 return make_number (c);
307 }
308
309 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
310 Smultibyte_char_to_unibyte, 1, 1, 0,
311 doc: /* Convert the multibyte character CH to unibyte character.\n\
312 The unibyte character is a result of encoding CH by
313 the current primary charset (value of `charset-primary'). */)
314 (ch)
315 Lisp_Object ch;
316 {
317 int c;
318
319 CHECK_CHARACTER (ch);
320 c = XFASTINT (ch);
321 c = CHAR_TO_BYTE8 (c);
322 return make_number (c);
323 }
324
325 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
326 doc: /* Return 1 regardless of the argument CHAR.
327 This is now an obsolete function. We keep it just for backward compatibility. */)
328 (ch)
329 Lisp_Object ch;
330 {
331 CHECK_CHARACTER (ch);
332 return make_number (1);
333 }
334
335 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
336 doc: /* Return width of CHAR when displayed in the current buffer.
337 The width is measured by how many columns it occupies on the screen.
338 Tab is taken to occupy `tab-width' columns. */)
339 (ch)
340 Lisp_Object ch;
341 {
342 Lisp_Object disp;
343 int c, width;
344 struct Lisp_Char_Table *dp = buffer_display_table ();
345
346 CHECK_CHARACTER (ch);
347 c = XINT (ch);
348
349 /* Get the way the display table would display it. */
350 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
351
352 if (VECTORP (disp))
353 width = ASIZE (disp);
354 else
355 width = CHAR_WIDTH (c);
356
357 return make_number (width);
358 }
359
360 /* Return width of string STR of length LEN when displayed in the
361 current buffer. The width is measured by how many columns it
362 occupies on the screen. If PRECISION > 0, return the width of
363 longest substring that doesn't exceed PRECISION, and set number of
364 characters and bytes of the substring in *NCHARS and *NBYTES
365 respectively. */
366
367 int
368 c_string_width (str, len, precision, nchars, nbytes)
369 const unsigned char *str;
370 int precision, *nchars, *nbytes;
371 {
372 int i = 0, i_byte = 0;
373 int width = 0;
374 struct Lisp_Char_Table *dp = buffer_display_table ();
375
376 while (i_byte < len)
377 {
378 int bytes, thiswidth;
379 Lisp_Object val;
380 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
381
382 if (dp)
383 {
384 val = DISP_CHAR_VECTOR (dp, c);
385 if (VECTORP (val))
386 thiswidth = XVECTOR (val)->size;
387 else
388 thiswidth = CHAR_WIDTH (c);
389 }
390 else
391 {
392 thiswidth = CHAR_WIDTH (c);
393 }
394
395 if (precision > 0
396 && (width + thiswidth > precision))
397 {
398 *nchars = i;
399 *nbytes = i_byte;
400 return width;
401 }
402 i++;
403 i_byte += bytes;
404 width += thiswidth;
405 }
406
407 if (precision > 0)
408 {
409 *nchars = i;
410 *nbytes = i_byte;
411 }
412
413 return width;
414 }
415
/* Return the display width, in screen columns, of the LEN bytes at
   STR as they would appear in the current buffer.  This is
   c_string_width without a precision limit.  */

int
strwidth (str, len)
     unsigned char *str;
     int len;
{
  int columns;

  /* A negative precision means "no limit", so the two output
     pointers are never written and may be NULL.  */
  columns = c_string_width (str, len, -1, NULL, NULL);
  return columns;
}
427
428 /* Return width of Lisp string STRING when displayed in the current
429 buffer. The width is measured by how many columns it occupies on
430 the screen while paying attention to compositions. If PRECISION >
431 0, return the width of longest substring that doesn't exceed
432 PRECISION, and set number of characters and bytes of the substring
433 in *NCHARS and *NBYTES respectively. */
434
435 int
436 lisp_string_width (string, precision, nchars, nbytes)
437 Lisp_Object string;
438 int precision, *nchars, *nbytes;
439 {
440 int len = SCHARS (string);
441 unsigned char *str = SDATA (string);
442 int i = 0, i_byte = 0;
443 int width = 0;
444 struct Lisp_Char_Table *dp = buffer_display_table ();
445
446 while (i < len)
447 {
448 int chars, bytes, thiswidth;
449 Lisp_Object val;
450 int cmp_id;
451 EMACS_INT ignore, end;
452
453 if (find_composition (i, -1, &ignore, &end, &val, string)
454 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
455 >= 0))
456 {
457 thiswidth = composition_table[cmp_id]->width;
458 chars = end - i;
459 bytes = string_char_to_byte (string, end) - i_byte;
460 }
461 else if (dp)
462 {
463 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
464
465 chars = 1;
466 val = DISP_CHAR_VECTOR (dp, c);
467 if (VECTORP (val))
468 thiswidth = XVECTOR (val)->size;
469 else
470 thiswidth = CHAR_WIDTH (c);
471 }
472 else
473 {
474 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
475
476 chars = 1;
477 thiswidth = CHAR_WIDTH (c);
478 }
479
480 if (precision > 0
481 && (width + thiswidth > precision))
482 {
483 *nchars = i;
484 *nbytes = i_byte;
485 return width;
486 }
487 i += chars;
488 i_byte += bytes;
489 width += thiswidth;
490 }
491
492 if (precision > 0)
493 {
494 *nchars = i;
495 *nbytes = i_byte;
496 }
497
498 return width;
499 }
500
501 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
502 doc: /* Return width of STRING when displayed in the current buffer.
503 Width is measured by how many columns it occupies on the screen.
504 When calculating width of a multibyte character in STRING,
505 only the base leading-code is considered; the validity of
506 the following bytes is not checked. Tabs in STRING are always
507 taken to occupy `tab-width' columns. */)
508 (str)
509 Lisp_Object str;
510 {
511 Lisp_Object val;
512
513 CHECK_STRING (str);
514 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
515 return val;
516 }
517
518 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
519 doc: /* Return the direction of CHAR.
520 The returned value is 0 for left-to-right and 1 for right-to-left. */)
521 (ch)
522 Lisp_Object ch;
523 {
524 int c;
525
526 CHECK_CHARACTER (ch);
527 c = XINT (ch);
528 return CHAR_TABLE_REF (Vchar_direction_table, c);
529 }
530
531 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
532 doc: /* Return number of characters between BEG and END.
533 This is now an obsolete function. We keep it just for backward compatibility. */)
534 (beg, end)
535 Lisp_Object beg, end;
536 {
537 int from, to;
538
539 CHECK_NUMBER_COERCE_MARKER (beg);
540 CHECK_NUMBER_COERCE_MARKER (end);
541
542 from = min (XFASTINT (beg), XFASTINT (end));
543 to = max (XFASTINT (beg), XFASTINT (end));
544
545 return make_number (to - from);
546 }
547
548 /* Return the number of characters in the NBYTES bytes at PTR.
549 This works by looking at the contents and checking for multibyte
550 sequences while assuming that there's no invalid sequence.
551 However, if the current buffer has enable-multibyte-characters =
552 nil, we treat each byte as a character. */
553
554 int
555 chars_in_text (ptr, nbytes)
556 const unsigned char *ptr;
557 int nbytes;
558 {
559 /* current_buffer is null at early stages of Emacs initialization. */
560 if (current_buffer == 0
561 || NILP (current_buffer->enable_multibyte_characters))
562 return nbytes;
563
564 return multibyte_chars_in_text (ptr, nbytes);
565 }
566
/* Return the number of characters in the NBYTES bytes at PTR,
   counting multibyte sequences and assuming there is no invalid
   sequence (an invalid one aborts).  The value of
   enable-multibyte-characters is not consulted.  */

int
multibyte_chars_in_text (ptr, nbytes)
     const unsigned char *ptr;
     int nbytes;
{
  const unsigned char *limit = ptr + nbytes;
  int count;

  for (count = 0; ptr < limit; count++)
    {
      int seq_len = MULTIBYTE_LENGTH (ptr, limit);

      /* A zero length means the bytes at PTR do not form a valid
         sequence, which callers promised would not happen.  */
      if (seq_len == 0)
        abort ();
      ptr += seq_len;
    }

  return count;
}
592
593 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
594 characters and bytes in it, and store them in *NCHARS and *NBYTES
595 respectively. On counting bytes, pay attention to that 8-bit
596 characters not constructing a valid multibyte sequence are
597 represented by 2-byte in a multibyte text. */
598
599 void
600 parse_str_as_multibyte (str, len, nchars, nbytes)
601 const unsigned char *str;
602 int len, *nchars, *nbytes;
603 {
604 const unsigned char *endp = str + len;
605 int n, chars = 0, bytes = 0;
606
607 if (len >= MAX_MULTIBYTE_LENGTH)
608 {
609 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
610 while (str < adjusted_endp)
611 {
612 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
613 str += n, bytes += n;
614 else
615 str++, bytes += 2;
616 chars++;
617 }
618 }
619 while (str < endp)
620 {
621 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
622 str += n, bytes += n;
623 else
624 str++, bytes += 2;
625 chars++;
626 }
627
628 *nchars = chars;
629 *nbytes = bytes;
630 return;
631 }
632
633 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
634 It actually converts only such 8-bit characters that don't contruct
635 a multibyte sequence to multibyte forms of Latin-1 characters. If
636 NCHARS is nonzero, set *NCHARS to the number of characters in the
637 text. It is assured that we can use LEN bytes at STR as a work
638 area and that is enough. Return the number of bytes of the
639 resulting text. */
640
641 int
642 str_as_multibyte (str, len, nbytes, nchars)
643 unsigned char *str;
644 int len, nbytes, *nchars;
645 {
646 unsigned char *p = str, *endp = str + nbytes;
647 unsigned char *to;
648 int chars = 0;
649 int n;
650
651 if (nbytes >= MAX_MULTIBYTE_LENGTH)
652 {
653 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
654 while (p < adjusted_endp
655 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
656 p += n, chars++;
657 }
658 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
659 p += n, chars++;
660 if (nchars)
661 *nchars = chars;
662 if (p == endp)
663 return nbytes;
664
665 to = p;
666 nbytes = endp - p;
667 endp = str + len;
668 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
669 p = endp - nbytes;
670
671 if (nbytes >= MAX_MULTIBYTE_LENGTH)
672 {
673 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
674 while (p < adjusted_endp)
675 {
676 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
677 {
678 while (n--)
679 *to++ = *p++;
680 }
681 else
682 {
683 int c = *p++;
684 c = BYTE8_TO_CHAR (c);
685 to += CHAR_STRING (c, to);
686 }
687 }
688 chars++;
689 }
690 while (p < endp)
691 {
692 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
693 {
694 while (n--)
695 *to++ = *p++;
696 }
697 else
698 {
699 int c = *p++;
700 c = BYTE8_TO_CHAR (c);
701 to += CHAR_STRING (c, to);
702 }
703 chars++;
704 }
705 if (nchars)
706 *nchars = chars;
707 return (to - str);
708 }
709
/* Scan the LEN bytes of the unibyte string at STR and return the
   number of bytes it may occupy once converted to a multibyte string
   by `str_to_multibyte': one byte for each byte below 0x80 and two
   bytes for every other byte.  */

int
parse_str_to_multibyte (str, len)
     unsigned char *str;
     int len;
{
  int total = 0;
  int i;

  for (i = 0; i < len; i++)
    {
      if (str[i] < 0x80)
        total += 1;
      else
        total += 2;
    }
  return total;
}
726
727
/* Convert the unibyte text at STR of BYTES bytes to a multibyte text
   that contains the same single-byte characters: every byte of 0x80
   or above is replaced by the multibyte form of its BYTE8_TO_CHAR
   character.  It is assured that we can use LEN bytes at STR as a
   work area and that is enough.  Return the number of bytes of the
   resulting text.  */

int
str_to_multibyte (str, len, bytes)
     unsigned char *str;
     int len, bytes;
{
  unsigned char *from, *limit = str + bytes;
  unsigned char *to;

  /* Skip the leading bytes below 0x80; they need no conversion.  */
  for (from = str; from < limit && *from < 0x80; from++)
    ;
  if (from == limit)
    return bytes;

  /* Move the remaining input to the end of the work area so that
     the expanded output written at TO cannot overrun it.  */
  to = from;
  bytes = limit - from;
  limit = str + len;
  safe_bcopy ((char *) from, (char *) (limit - bytes), bytes);
  from = limit - bytes;

  while (from < limit)
    {
      int c = *from++;

      if (c >= 0x80)
        c = BYTE8_TO_CHAR (c);
      to += CHAR_STRING (c, to);
    }
  return (to - str);
}
760
/* Arrange the multibyte text at STR of BYTES bytes as a unibyte
   text, in place.  Each character whose lead byte satisfies
   CHAR_BYTE8_HEAD_P is replaced by the single byte given by
   CHAR_TO_BYTE8; all other bytes are copied unchanged.  Return the
   number of bytes of the resulting text.  */

int
str_as_unibyte (str, bytes)
     unsigned char *str;
     int bytes;
{
  const unsigned char *p = str, *endp = str + bytes;
  unsigned char *to;
  int c, len;

  /* Skip the prefix that contains no raw-byte character; it stays
     where it is.  */
  for (; p < endp; p += BYTES_BY_CHAR_HEAD (*p))
    if (CHAR_BYTE8_HEAD_P (*p))
      break;

  to = str + (p - str);
  while (p < endp)
    {
      c = *p;
      if (CHAR_BYTE8_HEAD_P (c))
        {
          /* Decode the raw-byte character and store its single byte.  */
          c = STRING_CHAR_ADVANCE (p);
          *to++ = CHAR_TO_BYTE8 (c);
        }
      else
        {
          /* Copy the whole sequence of an ordinary character.  */
          for (len = BYTES_BY_CHAR_HEAD (c); len > 0; len--)
            *to++ = *p++;
        }
    }
  return (to - str);
}
799
800 int
801 string_count_byte8 (string)
802 Lisp_Object string;
803 {
804 int multibyte = STRING_MULTIBYTE (string);
805 int nbytes = SBYTES (string);
806 unsigned char *p = SDATA (string);
807 unsigned char *pend = p + nbytes;
808 int count = 0;
809 int c, len;
810
811 if (multibyte)
812 while (p < pend)
813 {
814 c = *p;
815 len = BYTES_BY_CHAR_HEAD (c);
816
817 if (CHAR_BYTE8_HEAD_P (c))
818 count++;
819 p += len;
820 }
821 else
822 while (p < pend)
823 {
824 if (*p++ >= 0x80)
825 count++;
826 }
827 return count;
828 }
829
830
831 Lisp_Object
832 string_escape_byte8 (string)
833 Lisp_Object string;
834 {
835 int nchars = SCHARS (string);
836 int nbytes = SBYTES (string);
837 int multibyte = STRING_MULTIBYTE (string);
838 int byte8_count;
839 const unsigned char *src, *src_end;
840 unsigned char *dst;
841 Lisp_Object val;
842 int c, len;
843
844 if (multibyte && nchars == nbytes)
845 return string;
846
847 byte8_count = string_count_byte8 (string);
848
849 if (byte8_count == 0)
850 return string;
851
852 if (multibyte)
853 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
854 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
855 nbytes + byte8_count * 2);
856 else
857 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
858 val = make_uninit_string (nbytes + byte8_count * 3);
859
860 src = SDATA (string);
861 src_end = src + nbytes;
862 dst = SDATA (val);
863 if (multibyte)
864 while (src < src_end)
865 {
866 c = *src;
867 len = BYTES_BY_CHAR_HEAD (c);
868
869 if (CHAR_BYTE8_HEAD_P (c))
870 {
871 c = STRING_CHAR_ADVANCE (src);
872 c = CHAR_TO_BYTE8 (c);
873 sprintf ((char *) dst, "\\%03o", c);
874 dst += 4;
875 }
876 else
877 while (len--) *dst++ = *src++;
878 }
879 else
880 while (src < src_end)
881 {
882 c = *src++;
883 if (c >= 0x80)
884 {
885 sprintf ((char *) dst, "\\%03o", c);
886 dst += 4;
887 }
888 else
889 *dst++ = c;
890 }
891 return val;
892 }
893
894 \f
895 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
896 doc: /*
897 Concatenate all the argument characters and make the result a string.
898 usage: (string &rest CHARACTERS) */)
899 (n, args)
900 int n;
901 Lisp_Object *args;
902 {
903 int i;
904 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
905 unsigned char *p = buf;
906 int c;
907
908 for (i = 0; i < n; i++)
909 {
910 CHECK_CHARACTER (args[i]);
911 c = XINT (args[i]);
912 p += CHAR_STRING (c, p);
913 }
914
915 return make_string_from_bytes ((char *) buf, n, p - buf);
916 }
917
/* One-time initialization hook for this file.  It currently has
   nothing to do.  */

void
init_character_once ()
{
  /* Intentionally empty.  */
}
922
923 #ifdef emacs
924
925 void
926 syms_of_character ()
927 {
928 DEFSYM (Qcharacterp, "characterp");
929 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
930
931 staticpro (&Vchar_unify_table);
932 Vchar_unify_table = Qnil;
933
934 defsubr (&Smax_char);
935 defsubr (&Scharacterp);
936 defsubr (&Sunibyte_char_to_multibyte);
937 defsubr (&Smultibyte_char_to_unibyte);
938 defsubr (&Schar_bytes);
939 defsubr (&Schar_width);
940 defsubr (&Sstring_width);
941 defsubr (&Schar_direction);
942 defsubr (&Schars_in_region);
943 defsubr (&Sstring);
944
945 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
946 doc: /*
947 Vector recording all translation tables ever defined.
948 Each element is a pair (SYMBOL . TABLE) relating the table to the
949 symbol naming it. The ID of a translation table is an index into this vector. */);
950 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
951
952 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
953 doc: /*
954 A char-table for characters which invoke auto-filling.
955 Such characters have value t in this table. */);
956 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
957 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
958 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
959
960 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
961 doc: /*
962 A char-table for width (columns) of each character. */);
963 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
964 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
965 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
966 make_number (4));
967
968 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
969 doc: /* A char-table for direction of each character. */);
970 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
971
972 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
973 doc: /* A char-table for each printable character. */);
974 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
975 Fset_char_table_range (Vprintable_chars,
976 Fcons (make_number (32), make_number (126)), Qt);
977 Fset_char_table_range (Vprintable_chars,
978 Fcons (make_number (160),
979 make_number (MAX_5_BYTE_CHAR)), Qt);
980
981 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
982 doc: /* Char table of script symbols.
983 It has one extra slot whose value is a list of script symbols. */);
984
985 /* Intern this now in case it isn't already done.
986 Setting this variable twice is harmless.
987 But don't staticpro it here--that is done in alloc.c. */
988 Qchar_table_extra_slots = intern ("char-table-extra-slots");
989 DEFSYM (Qchar_script_table, "char-script-table");
990 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
991 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
992 }
993
994 #endif /* emacs */
995
996 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
997 (do not change this comment) */