]> code.delx.au - gnu-emacs/blob - src/character.c
Revision: miles@gnu.org--gnu-2005/emacs--unicode--0--patch-7
[gnu-emacs] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2003
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 static Lisp_Object Qchar_script_table;
85
86 /* Mapping table from unibyte chars to multibyte chars. */
87 int unibyte_to_multibyte_table[256];
88
89 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
90 char. */
91 char unibyte_has_multibyte_table[256];
92
93 \f
94
95 /* Store multibyte form of character C at P. If C has modifier bits,
96 handle them appropriately. */
97
98 int
99 char_string (c, p)
100 int c;
101 unsigned char *p;
102 {
103 int bytes;
104
105 if (c & CHAR_MODIFIER_MASK)
106 {
107 /* As an non-ASCII character can't have modifier bits, we just
108 ignore the bits. */
109 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
110 {
111 /* For Meta, Shift, and Control modifiers, we need special care. */
112 if (c & CHAR_META)
113 {
114 /* Move the meta bit to the right place for a string. */
115 c = (c & ~CHAR_META) | 0x80;
116 }
117 if (c & CHAR_SHIFT)
118 {
119 /* Shift modifier is valid only with [A-Za-z]. */
120 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
121 c &= ~CHAR_SHIFT;
122 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
123 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
124 }
125 if (c & CHAR_CTL)
126 {
127 /* Simulate the code in lread.c. */
128 /* Allow `\C- ' and `\C-?'. */
129 if (c == (CHAR_CTL | ' '))
130 c = 0;
131 else if (c == (CHAR_CTL | '?'))
132 c = 127;
133 /* ASCII control chars are made from letters (both cases),
134 as well as the non-letters within 0100...0137. */
135 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
136 c &= (037 | (~0177 & ~CHAR_CTL));
137 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
138 c &= (037 | (~0177 & ~CHAR_CTL));
139 }
140 }
141
142 /* If C still has any modifier bits, just ignore it. */
143 c &= ~CHAR_MODIFIER_MASK;
144 }
145
146 MAYBE_UNIFY_CHAR (c);
147
148 if (c <= MAX_3_BYTE_CHAR)
149 {
150 bytes = CHAR_STRING (c, p);
151 }
152 else if (c <= MAX_4_BYTE_CHAR)
153 {
154 p[0] = (0xF0 | (c >> 18));
155 p[1] = (0x80 | ((c >> 12) & 0x3F));
156 p[2] = (0x80 | ((c >> 6) & 0x3F));
157 p[3] = (0x80 | (c & 0x3F));
158 bytes = 4;
159 }
160 else if (c <= MAX_5_BYTE_CHAR)
161 {
162 p[0] = 0xF8;
163 p[1] = (0x80 | ((c >> 18) & 0x0F));
164 p[2] = (0x80 | ((c >> 12) & 0x3F));
165 p[3] = (0x80 | ((c >> 6) & 0x3F));
166 p[4] = (0x80 | (c & 0x3F));
167 bytes = 5;
168 }
169 else
170 {
171 c = CHAR_TO_BYTE8 (c);
172 bytes = BYTE8_STRING (c, p);
173 }
174
175 return bytes;
176 }
177
178
179 /* Return a character whose multibyte form is at P. Set LEN is not
180 NULL, it must be a pointer to integer. In that case, set *LEN to
181 the byte length of the multibyte form. If ADVANCED is not NULL, is
182 must be a pointer to unsigned char. In that case, set *ADVANCED to
183 the ending address (i.e. the starting address of the next
184 character) of the multibyte form. */
185
186 int
187 string_char (p, advanced, len)
188 const unsigned char *p;
189 const unsigned char **advanced;
190 int *len;
191 {
192 int c;
193 const unsigned char *saved_p = p;
194
195 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
196 {
197 c = STRING_CHAR_ADVANCE (p);
198 }
199 else if (! (*p & 0x08))
200 {
201 c = ((((p)[0] & 0xF) << 18)
202 | (((p)[1] & 0x3F) << 12)
203 | (((p)[2] & 0x3F) << 6)
204 | ((p)[3] & 0x3F));
205 p += 4;
206 }
207 else
208 {
209 c = ((((p)[1] & 0x3F) << 18)
210 | (((p)[2] & 0x3F) << 12)
211 | (((p)[3] & 0x3F) << 6)
212 | ((p)[4] & 0x3F));
213 p += 5;
214 }
215
216 MAYBE_UNIFY_CHAR (c);
217
218 if (len)
219 *len = p - saved_p;
220 if (advanced)
221 *advanced = p;
222 return c;
223 }
224
225
226 /* Translate character C by translation table TABLE. If C is
227 negative, translate a character specified by CHARSET and CODE. If
228 no translation is found in TABLE, return the untranslated
229 character. If TABLE is a list, elements are char tables. In this
230 case, translace C by all tables. */
231
232 int
233 translate_char (table, c)
234 Lisp_Object table;
235 int c;
236 {
237 if (CHAR_TABLE_P (table))
238 {
239 Lisp_Object ch;
240
241 ch = CHAR_TABLE_REF (table, c);
242 if (CHARACTERP (ch))
243 c = XINT (ch);
244 }
245 else
246 {
247 for (; CONSP (table); table = XCDR (table))
248 c = translate_char (XCAR (table), c);
249 }
250 return c;
251 }
252
253 /* Convert the multibyte character C to unibyte 8-bit character based
254 on the current value of charset_unibyte. If dimension of
255 charset_unibyte is more than one, return (C & 0xFF).
256
257 The argument REV_TBL is now ignored. It will be removed in the
258 future. */
259
260 int
261 multibyte_char_to_unibyte (c, rev_tbl)
262 int c;
263 Lisp_Object rev_tbl;
264 {
265 struct charset *charset;
266 unsigned c1;
267
268 if (CHAR_BYTE8_P (c))
269 return CHAR_TO_BYTE8 (c);
270 charset = CHARSET_FROM_ID (charset_unibyte);
271 c1 = ENCODE_CHAR (charset, c);
272 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
273 }
274
275
276 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
277 doc: /* Return non-nil if OBJECT is a character. */)
278 (object, ignore)
279 Lisp_Object object, ignore;
280 {
281 return (CHARACTERP (object) ? Qt : Qnil);
282 }
283
284 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
285 doc: /* Return the character of the maximum code. */)
286 ()
287 {
288 return make_number (MAX_CHAR);
289 }
290
291 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
292 Sunibyte_char_to_multibyte, 1, 1, 0,
293 doc: /* Convert the unibyte character CH to multibyte character.
294 The multibyte character is a result of decoding CH by
295 the current unibyte charset (see `unibyte-charset'). */)
296 (ch)
297 Lisp_Object ch;
298 {
299 int c;
300 struct charset *charset;
301
302 CHECK_CHARACTER (ch);
303 c = XFASTINT (ch);
304 if (c >= 0400)
305 error ("Invalid unibyte character: %d", c);
306 charset = CHARSET_FROM_ID (charset_unibyte);
307 c = DECODE_CHAR (charset, c);
308 if (c < 0)
309 c = BYTE8_TO_CHAR (XFASTINT (ch));
310 return make_number (c);
311 }
312
313 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
314 Smultibyte_char_to_unibyte, 1, 1, 0,
315 doc: /* Convert the multibyte character CH to unibyte character.\n\
316 The unibyte character is a result of encoding CH by
317 the current primary charset (value of `charset-primary'). */)
318 (ch)
319 Lisp_Object ch;
320 {
321 int c;
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
325 c = CHAR_TO_BYTE8 (c);
326 return make_number (c);
327 }
328
329 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
330 doc: /* Return 1 regardless of the argument CHAR.
331 This is now an obsolete function. We keep it just for backward compatibility. */)
332 (ch)
333 Lisp_Object ch;
334 {
335 CHECK_CHARACTER (ch);
336 return make_number (1);
337 }
338
339 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
340 doc: /* Return width of CHAR when displayed in the current buffer.
341 The width is measured by how many columns it occupies on the screen.
342 Tab is taken to occupy `tab-width' columns. */)
343 (ch)
344 Lisp_Object ch;
345 {
346 Lisp_Object disp;
347 int c, width;
348 struct Lisp_Char_Table *dp = buffer_display_table ();
349
350 CHECK_CHARACTER (ch);
351 c = XINT (ch);
352
353 /* Get the way the display table would display it. */
354 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
355
356 if (VECTORP (disp))
357 width = ASIZE (disp);
358 else
359 width = CHAR_WIDTH (c);
360
361 return make_number (width);
362 }
363
364 /* Return width of string STR of length LEN when displayed in the
365 current buffer. The width is measured by how many columns it
366 occupies on the screen. If PRECISION > 0, return the width of
367 longest substring that doesn't exceed PRECISION, and set number of
368 characters and bytes of the substring in *NCHARS and *NBYTES
369 respectively. */
370
371 int
372 c_string_width (str, len, precision, nchars, nbytes)
373 const unsigned char *str;
374 int precision, *nchars, *nbytes;
375 {
376 int i = 0, i_byte = 0;
377 int width = 0;
378 struct Lisp_Char_Table *dp = buffer_display_table ();
379
380 while (i_byte < len)
381 {
382 int bytes, thiswidth;
383 Lisp_Object val;
384 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
385
386 if (dp)
387 {
388 val = DISP_CHAR_VECTOR (dp, c);
389 if (VECTORP (val))
390 thiswidth = XVECTOR (val)->size;
391 else
392 thiswidth = CHAR_WIDTH (c);
393 }
394 else
395 {
396 thiswidth = CHAR_WIDTH (c);
397 }
398
399 if (precision > 0
400 && (width + thiswidth > precision))
401 {
402 *nchars = i;
403 *nbytes = i_byte;
404 return width;
405 }
406 i++;
407 i_byte += bytes;
408 width += thiswidth;
409 }
410
411 if (precision > 0)
412 {
413 *nchars = i;
414 *nbytes = i_byte;
415 }
416
417 return width;
418 }
419
420 /* Return width of string STR of length LEN when displayed in the
421 current buffer. The width is measured by how many columns it
422 occupies on the screen. */
423
424 int
425 strwidth (str, len)
426 unsigned char *str;
427 int len;
428 {
429 return c_string_width (str, len, -1, NULL, NULL);
430 }
431
432 /* Return width of Lisp string STRING when displayed in the current
433 buffer. The width is measured by how many columns it occupies on
434 the screen while paying attention to compositions. If PRECISION >
435 0, return the width of longest substring that doesn't exceed
436 PRECISION, and set number of characters and bytes of the substring
437 in *NCHARS and *NBYTES respectively. */
438
439 int
440 lisp_string_width (string, precision, nchars, nbytes)
441 Lisp_Object string;
442 int precision, *nchars, *nbytes;
443 {
444 int len = SCHARS (string);
445 unsigned char *str = SDATA (string);
446 int i = 0, i_byte = 0;
447 int width = 0;
448 struct Lisp_Char_Table *dp = buffer_display_table ();
449
450 while (i < len)
451 {
452 int chars, bytes, thiswidth;
453 Lisp_Object val;
454 int cmp_id;
455 EMACS_INT ignore, end;
456
457 if (find_composition (i, -1, &ignore, &end, &val, string)
458 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
459 >= 0))
460 {
461 thiswidth = composition_table[cmp_id]->width;
462 chars = end - i;
463 bytes = string_char_to_byte (string, end) - i_byte;
464 }
465 else if (dp)
466 {
467 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
468
469 chars = 1;
470 val = DISP_CHAR_VECTOR (dp, c);
471 if (VECTORP (val))
472 thiswidth = XVECTOR (val)->size;
473 else
474 thiswidth = CHAR_WIDTH (c);
475 }
476 else
477 {
478 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
479
480 chars = 1;
481 thiswidth = CHAR_WIDTH (c);
482 }
483
484 if (precision > 0
485 && (width + thiswidth > precision))
486 {
487 *nchars = i;
488 *nbytes = i_byte;
489 return width;
490 }
491 i += chars;
492 i_byte += bytes;
493 width += thiswidth;
494 }
495
496 if (precision > 0)
497 {
498 *nchars = i;
499 *nbytes = i_byte;
500 }
501
502 return width;
503 }
504
505 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
506 doc: /* Return width of STRING when displayed in the current buffer.
507 Width is measured by how many columns it occupies on the screen.
508 When calculating width of a multibyte character in STRING,
509 only the base leading-code is considered; the validity of
510 the following bytes is not checked. Tabs in STRING are always
511 taken to occupy `tab-width' columns. */)
512 (str)
513 Lisp_Object str;
514 {
515 Lisp_Object val;
516
517 CHECK_STRING (str);
518 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
519 return val;
520 }
521
522 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
523 doc: /* Return the direction of CHAR.
524 The returned value is 0 for left-to-right and 1 for right-to-left. */)
525 (ch)
526 Lisp_Object ch;
527 {
528 int c;
529
530 CHECK_CHARACTER (ch);
531 c = XINT (ch);
532 return CHAR_TABLE_REF (Vchar_direction_table, c);
533 }
534
535 DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
536 doc: /* Return number of characters between BEG and END.
537 This is now an obsolete function. We keep it just for backward compatibility. */)
538 (beg, end)
539 Lisp_Object beg, end;
540 {
541 int from, to;
542
543 CHECK_NUMBER_COERCE_MARKER (beg);
544 CHECK_NUMBER_COERCE_MARKER (end);
545
546 from = min (XFASTINT (beg), XFASTINT (end));
547 to = max (XFASTINT (beg), XFASTINT (end));
548
549 return make_number (to - from);
550 }
551
552 /* Return the number of characters in the NBYTES bytes at PTR.
553 This works by looking at the contents and checking for multibyte
554 sequences while assuming that there's no invalid sequence.
555 However, if the current buffer has enable-multibyte-characters =
556 nil, we treat each byte as a character. */
557
558 int
559 chars_in_text (ptr, nbytes)
560 const unsigned char *ptr;
561 int nbytes;
562 {
563 /* current_buffer is null at early stages of Emacs initialization. */
564 if (current_buffer == 0
565 || NILP (current_buffer->enable_multibyte_characters))
566 return nbytes;
567
568 return multibyte_chars_in_text (ptr, nbytes);
569 }
570
571 /* Return the number of characters in the NBYTES bytes at PTR.
572 This works by looking at the contents and checking for multibyte
573 sequences while assuming that there's no invalid sequence. It
574 ignores enable-multibyte-characters. */
575
576 int
577 multibyte_chars_in_text (ptr, nbytes)
578 const unsigned char *ptr;
579 int nbytes;
580 {
581 const unsigned char *endp = ptr + nbytes;
582 int chars = 0;
583
584 while (ptr < endp)
585 {
586 int len = MULTIBYTE_LENGTH (ptr, endp);
587
588 if (len == 0)
589 abort ();
590 ptr += len;
591 chars++;
592 }
593
594 return chars;
595 }
596
597 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
598 characters and bytes in it, and store them in *NCHARS and *NBYTES
599 respectively. On counting bytes, pay attention to that 8-bit
600 characters not constructing a valid multibyte sequence are
601 represented by 2-byte in a multibyte text. */
602
603 void
604 parse_str_as_multibyte (str, len, nchars, nbytes)
605 const unsigned char *str;
606 int len, *nchars, *nbytes;
607 {
608 const unsigned char *endp = str + len;
609 int n, chars = 0, bytes = 0;
610
611 if (len >= MAX_MULTIBYTE_LENGTH)
612 {
613 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
614 while (str < adjusted_endp)
615 {
616 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
617 str += n, bytes += n;
618 else
619 str++, bytes += 2;
620 chars++;
621 }
622 }
623 while (str < endp)
624 {
625 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
626 str += n, bytes += n;
627 else
628 str++, bytes += 2;
629 chars++;
630 }
631
632 *nchars = chars;
633 *nbytes = bytes;
634 return;
635 }
636
637 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
638 It actually converts only such 8-bit characters that don't contruct
639 a multibyte sequence to multibyte forms of Latin-1 characters. If
640 NCHARS is nonzero, set *NCHARS to the number of characters in the
641 text. It is assured that we can use LEN bytes at STR as a work
642 area and that is enough. Return the number of bytes of the
643 resulting text. */
644
645 int
646 str_as_multibyte (str, len, nbytes, nchars)
647 unsigned char *str;
648 int len, nbytes, *nchars;
649 {
650 unsigned char *p = str, *endp = str + nbytes;
651 unsigned char *to;
652 int chars = 0;
653 int n;
654
655 if (nbytes >= MAX_MULTIBYTE_LENGTH)
656 {
657 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
658 while (p < adjusted_endp
659 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
660 p += n, chars++;
661 }
662 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
663 p += n, chars++;
664 if (nchars)
665 *nchars = chars;
666 if (p == endp)
667 return nbytes;
668
669 to = p;
670 nbytes = endp - p;
671 endp = str + len;
672 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
673 p = endp - nbytes;
674
675 if (nbytes >= MAX_MULTIBYTE_LENGTH)
676 {
677 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
678 while (p < adjusted_endp)
679 {
680 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
681 {
682 while (n--)
683 *to++ = *p++;
684 }
685 else
686 {
687 int c = *p++;
688 c = BYTE8_TO_CHAR (c);
689 to += CHAR_STRING (c, to);
690 }
691 }
692 chars++;
693 }
694 while (p < endp)
695 {
696 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
697 {
698 while (n--)
699 *to++ = *p++;
700 }
701 else
702 {
703 int c = *p++;
704 c = BYTE8_TO_CHAR (c);
705 to += CHAR_STRING (c, to);
706 }
707 chars++;
708 }
709 if (nchars)
710 *nchars = chars;
711 return (to - str);
712 }
713
714 /* Parse unibyte string at STR of LEN bytes, and return the number of
715 bytes it may ocupy when converted to multibyte string by
716 `str_to_multibyte'. */
717
718 int
719 parse_str_to_multibyte (str, len)
720 unsigned char *str;
721 int len;
722 {
723 unsigned char *endp = str + len;
724 int bytes;
725
726 for (bytes = 0; str < endp; str++)
727 bytes += (*str < 0x80) ? 1 : 2;
728 return bytes;
729 }
730
731
732 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
733 that contains the same single-byte characters. It actually
734 converts all 8-bit characters to multibyte forms. It is assured
735 that we can use LEN bytes at STR as a work area and that is
736 enough. */
737
738 int
739 str_to_multibyte (str, len, bytes)
740 unsigned char *str;
741 int len, bytes;
742 {
743 unsigned char *p = str, *endp = str + bytes;
744 unsigned char *to;
745
746 while (p < endp && *p < 0x80) p++;
747 if (p == endp)
748 return bytes;
749 to = p;
750 bytes = endp - p;
751 endp = str + len;
752 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
753 p = endp - bytes;
754 while (p < endp)
755 {
756 int c = *p++;
757
758 if (c >= 0x80)
759 c = BYTE8_TO_CHAR (c);
760 to += CHAR_STRING (c, to);
761 }
762 return (to - str);
763 }
764
765 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
766 actually converts characters in the range 0x80..0xFF to
767 unibyte. */
768
769 int
770 str_as_unibyte (str, bytes)
771 unsigned char *str;
772 int bytes;
773 {
774 const unsigned char *p = str, *endp = str + bytes;
775 unsigned char *to;
776 int c, len;
777
778 while (p < endp)
779 {
780 c = *p;
781 len = BYTES_BY_CHAR_HEAD (c);
782 if (CHAR_BYTE8_HEAD_P (c))
783 break;
784 p += len;
785 }
786 to = str + (p - str);
787 while (p < endp)
788 {
789 c = *p;
790 len = BYTES_BY_CHAR_HEAD (c);
791 if (CHAR_BYTE8_HEAD_P (c))
792 {
793 c = STRING_CHAR_ADVANCE (p);
794 *to++ = CHAR_TO_BYTE8 (c);
795 }
796 else
797 {
798 while (len--) *to++ = *p++;
799 }
800 }
801 return (to - str);
802 }
803
804 int
805 string_count_byte8 (string)
806 Lisp_Object string;
807 {
808 int multibyte = STRING_MULTIBYTE (string);
809 int nbytes = SBYTES (string);
810 unsigned char *p = SDATA (string);
811 unsigned char *pend = p + nbytes;
812 int count = 0;
813 int c, len;
814
815 if (multibyte)
816 while (p < pend)
817 {
818 c = *p;
819 len = BYTES_BY_CHAR_HEAD (c);
820
821 if (CHAR_BYTE8_HEAD_P (c))
822 count++;
823 p += len;
824 }
825 else
826 while (p < pend)
827 {
828 if (*p++ >= 0x80)
829 count++;
830 }
831 return count;
832 }
833
834
835 Lisp_Object
836 string_escape_byte8 (string)
837 Lisp_Object string;
838 {
839 int nchars = SCHARS (string);
840 int nbytes = SBYTES (string);
841 int multibyte = STRING_MULTIBYTE (string);
842 int byte8_count;
843 const unsigned char *src, *src_end;
844 unsigned char *dst;
845 Lisp_Object val;
846 int c, len;
847
848 if (multibyte && nchars == nbytes)
849 return string;
850
851 byte8_count = string_count_byte8 (string);
852
853 if (byte8_count == 0)
854 return string;
855
856 if (multibyte)
857 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
858 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
859 nbytes + byte8_count * 2);
860 else
861 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
862 val = make_uninit_string (nbytes + byte8_count * 3);
863
864 src = SDATA (string);
865 src_end = src + nbytes;
866 dst = SDATA (val);
867 if (multibyte)
868 while (src < src_end)
869 {
870 c = *src;
871 len = BYTES_BY_CHAR_HEAD (c);
872
873 if (CHAR_BYTE8_HEAD_P (c))
874 {
875 c = STRING_CHAR_ADVANCE (src);
876 c = CHAR_TO_BYTE8 (c);
877 sprintf ((char *) dst, "\\%03o", c);
878 dst += 4;
879 }
880 else
881 while (len--) *dst++ = *src++;
882 }
883 else
884 while (src < src_end)
885 {
886 c = *src++;
887 if (c >= 0x80)
888 {
889 sprintf ((char *) dst, "\\%03o", c);
890 dst += 4;
891 }
892 else
893 *dst++ = c;
894 }
895 return val;
896 }
897
898 \f
899 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
900 doc: /*
901 Concatenate all the argument characters and make the result a string.
902 usage: (string &rest CHARACTERS) */)
903 (n, args)
904 int n;
905 Lisp_Object *args;
906 {
907 int i;
908 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
909 unsigned char *p = buf;
910 int c;
911
912 for (i = 0; i < n; i++)
913 {
914 CHECK_CHARACTER (args[i]);
915 c = XINT (args[i]);
916 p += CHAR_STRING (c, p);
917 }
918
919 return make_string_from_bytes ((char *) buf, n, p - buf);
920 }
921
922 void
923 init_character_once ()
924 {
925 }
926
927 #ifdef emacs
928
929 void
930 syms_of_character ()
931 {
932 DEFSYM (Qcharacterp, "characterp");
933 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
934
935 staticpro (&Vchar_unify_table);
936 Vchar_unify_table = Qnil;
937
938 defsubr (&Smax_char);
939 defsubr (&Scharacterp);
940 defsubr (&Sunibyte_char_to_multibyte);
941 defsubr (&Smultibyte_char_to_unibyte);
942 defsubr (&Schar_bytes);
943 defsubr (&Schar_width);
944 defsubr (&Sstring_width);
945 defsubr (&Schar_direction);
946 defsubr (&Schars_in_region);
947 defsubr (&Sstring);
948
949 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
950 doc: /*
951 Vector recording all translation tables ever defined.
952 Each element is a pair (SYMBOL . TABLE) relating the table to the
953 symbol naming it. The ID of a translation table is an index into this vector. */);
954 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
955
956 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
957 doc: /*
958 A char-table for characters which invoke auto-filling.
959 Such characters have value t in this table. */);
960 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
961 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
962 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
963
964 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
965 doc: /*
966 A char-table for width (columns) of each character. */);
967 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
968 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
969 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
970 make_number (4));
971
972 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
973 doc: /* A char-table for direction of each character. */);
974 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
975
976 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
977 doc: /* A char-table for each printable character. */);
978 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
979 Fset_char_table_range (Vprintable_chars,
980 Fcons (make_number (32), make_number (126)), Qt);
981 Fset_char_table_range (Vprintable_chars,
982 Fcons (make_number (160),
983 make_number (MAX_5_BYTE_CHAR)), Qt);
984
985 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
986 doc: /* Char table of script symbols.
987 It has one extra slot whose value is a list of script symbols. */);
988
989 /* Intern this now in case it isn't already done.
990 Setting this variable twice is harmless.
991 But don't staticpro it here--that is done in alloc.c. */
992 Qchar_table_extra_slots = intern ("char-table-extra-slots");
993 DEFSYM (Qchar_script_table, "char-script-table");
994 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
995 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
996 }
997
998 #endif /* emacs */
999
1000 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1001 (do not change this comment) */