]> code.delx.au - gnu-emacs/blob - src/character.c
Copy copyright fix from 2010-06-26T12:01:31Z!eliz@gnu.org to one more file.
[gnu-emacs] / src / character.c
1 /* Basic character support.
2 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
3 2010, 2011, 2012 Free Software Foundation, Inc.
4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10 This file is part of GNU Emacs.
11
12 GNU Emacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
16
17 GNU Emacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
24
25 /* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28 #ifdef emacs
29 #include <config.h>
30 #endif
31
32 #include <stdio.h>
33
34 #ifdef emacs
35
36 #include <sys/types.h>
37 #include <setjmp.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 /* Alist of scripts vs representative characters. */
85 Lisp_Object Vscript_representative_chars;
86
87 static Lisp_Object Qchar_script_table;
88
89 Lisp_Object Vunicode_category_table;
90 \f
91
92 /* If character code C has modifier masks, reflect them to the
93 character code if possible. Return the resulting code. */
94
95 int
96 char_resolve_modifier_mask (c)
97 int c;
98 {
99 /* A non-ASCII character can't reflect modifier bits to the code. */
100 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
101 return c;
102
103 /* For Meta, Shift, and Control modifiers, we need special care. */
104 if (c & CHAR_SHIFT)
105 {
106 /* Shift modifier is valid only with [A-Za-z]. */
107 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
108 c &= ~CHAR_SHIFT;
109 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
110 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
111 /* Shift modifier for control characters and SPC is ignored. */
112 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
113 c &= ~CHAR_SHIFT;
114 }
115 if (c & CHAR_CTL)
116 {
117 /* Simulate the code in lread.c. */
118 /* Allow `\C- ' and `\C-?'. */
119 if ((c & 0377) == ' ')
120 c &= ~0177 & ~ CHAR_CTL;
121 else if ((c & 0377) == '?')
122 c = 0177 | (c & ~0177 & ~CHAR_CTL);
123 /* ASCII control chars are made from letters (both cases),
124 as well as the non-letters within 0100...0137. */
125 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
126 c &= (037 | (~0177 & ~CHAR_CTL));
127 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
128 c &= (037 | (~0177 & ~CHAR_CTL));
129 }
130 #if 0 /* This is outside the scope of this function. (bug#4751) */
131 if (c & CHAR_META)
132 {
133 /* Move the meta bit to the right place for a string. */
134 c = (c & ~CHAR_META) | 0x80;
135 }
136 #endif
137
138 return c;
139 }
140
141
142 /* Store multibyte form of character C at P. If C has modifier bits,
143 handle them appropriately. */
144
145 int
146 char_string (c, p)
147 unsigned c;
148 unsigned char *p;
149 {
150 int bytes;
151
152 if (c & CHAR_MODIFIER_MASK)
153 {
154 c = (unsigned) char_resolve_modifier_mask ((int) c);
155 /* If C still has any modifier bits, just ignore it. */
156 c &= ~CHAR_MODIFIER_MASK;
157 }
158
159 MAYBE_UNIFY_CHAR (c);
160
161 if (c <= MAX_3_BYTE_CHAR)
162 {
163 bytes = CHAR_STRING (c, p);
164 }
165 else if (c <= MAX_4_BYTE_CHAR)
166 {
167 p[0] = (0xF0 | (c >> 18));
168 p[1] = (0x80 | ((c >> 12) & 0x3F));
169 p[2] = (0x80 | ((c >> 6) & 0x3F));
170 p[3] = (0x80 | (c & 0x3F));
171 bytes = 4;
172 }
173 else if (c <= MAX_5_BYTE_CHAR)
174 {
175 p[0] = 0xF8;
176 p[1] = (0x80 | ((c >> 18) & 0x0F));
177 p[2] = (0x80 | ((c >> 12) & 0x3F));
178 p[3] = (0x80 | ((c >> 6) & 0x3F));
179 p[4] = (0x80 | (c & 0x3F));
180 bytes = 5;
181 }
182 else if (c <= MAX_CHAR)
183 {
184 c = CHAR_TO_BYTE8 (c);
185 bytes = BYTE8_STRING (c, p);
186 }
187 else
188 error ("Invalid character: %d", c);
189
190 return bytes;
191 }
192
193
194 /* Return a character whose multibyte form is at P. Set LEN is not
195 NULL, it must be a pointer to integer. In that case, set *LEN to
196 the byte length of the multibyte form. If ADVANCED is not NULL, is
197 must be a pointer to unsigned char. In that case, set *ADVANCED to
198 the ending address (i.e. the starting address of the next
199 character) of the multibyte form. */
200
201 int
202 string_char (p, advanced, len)
203 const unsigned char *p;
204 const unsigned char **advanced;
205 int *len;
206 {
207 int c;
208 const unsigned char *saved_p = p;
209
210 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
211 {
212 c = STRING_CHAR_ADVANCE (p);
213 }
214 else if (! (*p & 0x08))
215 {
216 c = ((((p)[0] & 0xF) << 18)
217 | (((p)[1] & 0x3F) << 12)
218 | (((p)[2] & 0x3F) << 6)
219 | ((p)[3] & 0x3F));
220 p += 4;
221 }
222 else
223 {
224 c = ((((p)[1] & 0x3F) << 18)
225 | (((p)[2] & 0x3F) << 12)
226 | (((p)[3] & 0x3F) << 6)
227 | ((p)[4] & 0x3F));
228 p += 5;
229 }
230
231 MAYBE_UNIFY_CHAR (c);
232
233 if (len)
234 *len = p - saved_p;
235 if (advanced)
236 *advanced = p;
237 return c;
238 }
239
240
241 /* Translate character C by translation table TABLE. If C is
242 negative, translate a character specified by CHARSET and CODE. If
243 no translation is found in TABLE, return the untranslated
244 character. If TABLE is a list, elements are char tables. In this
245 case, translace C by all tables. */
246
247 int
248 translate_char (table, c)
249 Lisp_Object table;
250 int c;
251 {
252 if (CHAR_TABLE_P (table))
253 {
254 Lisp_Object ch;
255
256 ch = CHAR_TABLE_REF (table, c);
257 if (CHARACTERP (ch))
258 c = XINT (ch);
259 }
260 else
261 {
262 for (; CONSP (table); table = XCDR (table))
263 c = translate_char (XCAR (table), c);
264 }
265 return c;
266 }
267
268 /* Convert ASCII or 8-bit character C to unibyte. If C is none of
269 them, return (C & 0xFF).
270
271 The argument REV_TBL is now ignored. It will be removed in the
272 future. */
273
274 int
275 multibyte_char_to_unibyte (c, rev_tbl)
276 int c;
277 Lisp_Object rev_tbl;
278 {
279 if (c < 0x80)
280 return c;
281 if (CHAR_BYTE8_P (c))
282 return CHAR_TO_BYTE8 (c);
283 return (c & 0xFF);
284 }
285
286 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
287 by charset_unibyte. */
288
289 int
290 multibyte_char_to_unibyte_safe (c)
291 int c;
292 {
293 if (c < 0x80)
294 return c;
295 if (CHAR_BYTE8_P (c))
296 return CHAR_TO_BYTE8 (c);
297 return -1;
298 }
299
300 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
301 doc: /* Return non-nil if OBJECT is a character. */)
302 (object, ignore)
303 Lisp_Object object, ignore;
304 {
305 return (CHARACTERP (object) ? Qt : Qnil);
306 }
307
308 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
309 doc: /* Return the character of the maximum code. */)
310 ()
311 {
312 return make_number (MAX_CHAR);
313 }
314
315 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
316 Sunibyte_char_to_multibyte, 1, 1, 0,
317 doc: /* Convert the byte CH to multibyte character. */)
318 (ch)
319 Lisp_Object ch;
320 {
321 int c;
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
325 if (c >= 0x100)
326 error ("Not a unibyte character: %d", c);
327 MAKE_CHAR_MULTIBYTE (c);
328 return make_number (c);
329 }
330
331 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
332 Smultibyte_char_to_unibyte, 1, 1, 0,
333 doc: /* Convert the multibyte character CH to a byte.
334 If the multibyte character does not represent a byte, return -1. */)
335 (ch)
336 Lisp_Object ch;
337 {
338 int cm;
339
340 CHECK_CHARACTER (ch);
341 cm = XFASTINT (ch);
342 if (cm < 256)
343 /* Can't distinguish a byte read from a unibyte buffer from
344 a latin1 char, so let's let it slide. */
345 return ch;
346 else
347 {
348 int cu = CHAR_TO_BYTE_SAFE (cm);
349 return make_number (cu);
350 }
351 }
352
353 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
354 doc: /* Return 1 regardless of the argument CHAR.
355 This is now an obsolete function. We keep it just for backward compatibility.
356 usage: (char-bytes CHAR) */)
357 (ch)
358 Lisp_Object ch;
359 {
360 CHECK_CHARACTER (ch);
361 return make_number (1);
362 }
363
364
365 /* Return width (columns) of C considering the buffer display table DP. */
366
367 static int
368 char_width (int c, struct Lisp_Char_Table *dp)
369 {
370 int width = CHAR_WIDTH (c);
371
372 if (dp)
373 {
374 Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
375 int i;
376
377 if (VECTORP (disp))
378 for (i = 0, width = 0; i < ASIZE (disp); i++)
379 {
380 ch = AREF (disp, i);
381 if (CHARACTERP (ch))
382 width += CHAR_WIDTH (XFASTINT (ch));
383 }
384 }
385 return width;
386 }
387
388
389 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
390 doc: /* Return width of CHAR when displayed in the current buffer.
391 The width is measured by how many columns it occupies on the screen.
392 Tab is taken to occupy `tab-width' columns.
393 usage: (char-width CHAR) */)
394 (ch)
395 Lisp_Object ch;
396 {
397 int c, width;
398
399 CHECK_CHARACTER (ch);
400 c = XINT (ch);
401
402 width = char_width (c, buffer_display_table ());
403 return make_number (width);
404 }
405
406 /* Return width of string STR of length LEN when displayed in the
407 current buffer. The width is measured by how many columns it
408 occupies on the screen. If PRECISION > 0, return the width of
409 longest substring that doesn't exceed PRECISION, and set number of
410 characters and bytes of the substring in *NCHARS and *NBYTES
411 respectively. */
412
413 int
414 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
415 {
416 int i = 0, i_byte = 0;
417 int width = 0;
418 struct Lisp_Char_Table *dp = buffer_display_table ();
419
420 while (i_byte < len)
421 {
422 int bytes;
423 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
424 int thiswidth = char_width (c, dp);
425
426 if (precision > 0
427 && (width + thiswidth > precision))
428 {
429 *nchars = i;
430 *nbytes = i_byte;
431 return width;
432 }
433 i++;
434 i_byte += bytes;
435 width += thiswidth;
436 }
437
438 if (precision > 0)
439 {
440 *nchars = i;
441 *nbytes = i_byte;
442 }
443
444 return width;
445 }
446
447 /* Return width of string STR of length LEN when displayed in the
448 current buffer. The width is measured by how many columns it
449 occupies on the screen. */
450
451 int
452 strwidth (str, len)
453 unsigned char *str;
454 int len;
455 {
456 return c_string_width (str, len, -1, NULL, NULL);
457 }
458
459 /* Return width of Lisp string STRING when displayed in the current
460 buffer. The width is measured by how many columns it occupies on
461 the screen while paying attention to compositions. If PRECISION >
462 0, return the width of longest substring that doesn't exceed
463 PRECISION, and set number of characters and bytes of the substring
464 in *NCHARS and *NBYTES respectively. */
465
466 int
467 lisp_string_width (string, precision, nchars, nbytes)
468 Lisp_Object string;
469 int precision, *nchars, *nbytes;
470 {
471 int len = SCHARS (string);
472 /* This set multibyte to 0 even if STRING is multibyte when it
473 contains only ascii and eight-bit-graphic, but that's
474 intentional. */
475 int multibyte = len < SBYTES (string);
476 unsigned char *str = SDATA (string);
477 int i = 0, i_byte = 0;
478 int width = 0;
479 struct Lisp_Char_Table *dp = buffer_display_table ();
480
481 while (i < len)
482 {
483 int chars, bytes, thiswidth;
484 Lisp_Object val;
485 int cmp_id;
486 EMACS_INT ignore, end;
487
488 if (find_composition (i, -1, &ignore, &end, &val, string)
489 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
490 >= 0))
491 {
492 thiswidth = composition_table[cmp_id]->width;
493 chars = end - i;
494 bytes = string_char_to_byte (string, end) - i_byte;
495 }
496 else
497 {
498 int c;
499
500 if (multibyte)
501 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
502 else
503 c = str[i_byte], bytes = 1;
504 chars = 1;
505 thiswidth = char_width (c, dp);
506 }
507
508 if (precision > 0
509 && (width + thiswidth > precision))
510 {
511 *nchars = i;
512 *nbytes = i_byte;
513 return width;
514 }
515 i += chars;
516 i_byte += bytes;
517 width += thiswidth;
518 }
519
520 if (precision > 0)
521 {
522 *nchars = i;
523 *nbytes = i_byte;
524 }
525
526 return width;
527 }
528
529 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
530 doc: /* Return width of STRING when displayed in the current buffer.
531 Width is measured by how many columns it occupies on the screen.
532 When calculating width of a multibyte character in STRING,
533 only the base leading-code is considered; the validity of
534 the following bytes is not checked. Tabs in STRING are always
535 taken to occupy `tab-width' columns.
536 usage: (string-width STRING) */)
537 (str)
538 Lisp_Object str;
539 {
540 Lisp_Object val;
541
542 CHECK_STRING (str);
543 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
544 return val;
545 }
546
547 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
548 doc: /* Return the direction of CHAR.
549 The returned value is 0 for left-to-right and 1 for right-to-left.
550 usage: (char-direction CHAR) */)
551 (ch)
552 Lisp_Object ch;
553 {
554 int c;
555
556 CHECK_CHARACTER (ch);
557 c = XINT (ch);
558 return CHAR_TABLE_REF (Vchar_direction_table, c);
559 }
560
561 /* Return the number of characters in the NBYTES bytes at PTR.
562 This works by looking at the contents and checking for multibyte
563 sequences while assuming that there's no invalid sequence.
564 However, if the current buffer has enable-multibyte-characters =
565 nil, we treat each byte as a character. */
566
567 EMACS_INT
568 chars_in_text (ptr, nbytes)
569 const unsigned char *ptr;
570 EMACS_INT nbytes;
571 {
572 /* current_buffer is null at early stages of Emacs initialization. */
573 if (current_buffer == 0
574 || NILP (current_buffer->enable_multibyte_characters))
575 return nbytes;
576
577 return multibyte_chars_in_text (ptr, nbytes);
578 }
579
580 /* Return the number of characters in the NBYTES bytes at PTR.
581 This works by looking at the contents and checking for multibyte
582 sequences while assuming that there's no invalid sequence. It
583 ignores enable-multibyte-characters. */
584
585 EMACS_INT
586 multibyte_chars_in_text (ptr, nbytes)
587 const unsigned char *ptr;
588 EMACS_INT nbytes;
589 {
590 const unsigned char *endp = ptr + nbytes;
591 int chars = 0;
592
593 while (ptr < endp)
594 {
595 int len = MULTIBYTE_LENGTH (ptr, endp);
596
597 if (len == 0)
598 abort ();
599 ptr += len;
600 chars++;
601 }
602
603 return chars;
604 }
605
606 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
607 characters and bytes in it, and store them in *NCHARS and *NBYTES
608 respectively. On counting bytes, pay attention to that 8-bit
609 characters not constructing a valid multibyte sequence are
610 represented by 2-byte in a multibyte text. */
611
612 void
613 parse_str_as_multibyte (str, len, nchars, nbytes)
614 const unsigned char *str;
615 int len, *nchars, *nbytes;
616 {
617 const unsigned char *endp = str + len;
618 int n, chars = 0, bytes = 0;
619
620 if (len >= MAX_MULTIBYTE_LENGTH)
621 {
622 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
623 while (str < adjusted_endp)
624 {
625 if (! CHAR_BYTE8_HEAD_P (*str)
626 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
627 str += n, bytes += n;
628 else
629 str++, bytes += 2;
630 chars++;
631 }
632 }
633 while (str < endp)
634 {
635 if (! CHAR_BYTE8_HEAD_P (*str)
636 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
637 str += n, bytes += n;
638 else
639 str++, bytes += 2;
640 chars++;
641 }
642
643 *nchars = chars;
644 *nbytes = bytes;
645 return;
646 }
647
648 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
649 It actually converts only such 8-bit characters that don't contruct
650 a multibyte sequence to multibyte forms of Latin-1 characters. If
651 NCHARS is nonzero, set *NCHARS to the number of characters in the
652 text. It is assured that we can use LEN bytes at STR as a work
653 area and that is enough. Return the number of bytes of the
654 resulting text. */
655
656 int
657 str_as_multibyte (str, len, nbytes, nchars)
658 unsigned char *str;
659 int len, nbytes, *nchars;
660 {
661 unsigned char *p = str, *endp = str + nbytes;
662 unsigned char *to;
663 int chars = 0;
664 int n;
665
666 if (nbytes >= MAX_MULTIBYTE_LENGTH)
667 {
668 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
669 while (p < adjusted_endp
670 && ! CHAR_BYTE8_HEAD_P (*p)
671 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
672 p += n, chars++;
673 }
674 while (p < endp
675 && ! CHAR_BYTE8_HEAD_P (*p)
676 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
677 p += n, chars++;
678 if (nchars)
679 *nchars = chars;
680 if (p == endp)
681 return nbytes;
682
683 to = p;
684 nbytes = endp - p;
685 endp = str + len;
686 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
687 p = endp - nbytes;
688
689 if (nbytes >= MAX_MULTIBYTE_LENGTH)
690 {
691 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
692 while (p < adjusted_endp)
693 {
694 if (! CHAR_BYTE8_HEAD_P (*p)
695 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
696 {
697 while (n--)
698 *to++ = *p++;
699 }
700 else
701 {
702 int c = *p++;
703 c = BYTE8_TO_CHAR (c);
704 to += CHAR_STRING (c, to);
705 }
706 }
707 chars++;
708 }
709 while (p < endp)
710 {
711 if (! CHAR_BYTE8_HEAD_P (*p)
712 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
713 {
714 while (n--)
715 *to++ = *p++;
716 }
717 else
718 {
719 int c = *p++;
720 c = BYTE8_TO_CHAR (c);
721 to += CHAR_STRING (c, to);
722 }
723 chars++;
724 }
725 if (nchars)
726 *nchars = chars;
727 return (to - str);
728 }
729
730 /* Parse unibyte string at STR of LEN bytes, and return the number of
731 bytes it may ocupy when converted to multibyte string by
732 `str_to_multibyte'. */
733
734 int
735 parse_str_to_multibyte (str, len)
736 unsigned char *str;
737 int len;
738 {
739 unsigned char *endp = str + len;
740 int bytes;
741
742 for (bytes = 0; str < endp; str++)
743 bytes += (*str < 0x80) ? 1 : 2;
744 return bytes;
745 }
746
747
748 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
749 that contains the same single-byte characters. It actually
750 converts all 8-bit characters to multibyte forms. It is assured
751 that we can use LEN bytes at STR as a work area and that is
752 enough. */
753
754 int
755 str_to_multibyte (str, len, bytes)
756 unsigned char *str;
757 int len, bytes;
758 {
759 unsigned char *p = str, *endp = str + bytes;
760 unsigned char *to;
761
762 while (p < endp && *p < 0x80) p++;
763 if (p == endp)
764 return bytes;
765 to = p;
766 bytes = endp - p;
767 endp = str + len;
768 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
769 p = endp - bytes;
770 while (p < endp)
771 {
772 int c = *p++;
773
774 if (c >= 0x80)
775 c = BYTE8_TO_CHAR (c);
776 to += CHAR_STRING (c, to);
777 }
778 return (to - str);
779 }
780
781 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
782 actually converts characters in the range 0x80..0xFF to
783 unibyte. */
784
785 int
786 str_as_unibyte (str, bytes)
787 unsigned char *str;
788 int bytes;
789 {
790 const unsigned char *p = str, *endp = str + bytes;
791 unsigned char *to;
792 int c, len;
793
794 while (p < endp)
795 {
796 c = *p;
797 len = BYTES_BY_CHAR_HEAD (c);
798 if (CHAR_BYTE8_HEAD_P (c))
799 break;
800 p += len;
801 }
802 to = str + (p - str);
803 while (p < endp)
804 {
805 c = *p;
806 len = BYTES_BY_CHAR_HEAD (c);
807 if (CHAR_BYTE8_HEAD_P (c))
808 {
809 c = STRING_CHAR_ADVANCE (p);
810 *to++ = CHAR_TO_BYTE8 (c);
811 }
812 else
813 {
814 while (len--) *to++ = *p++;
815 }
816 }
817 return (to - str);
818 }
819
820 /* Convert eight-bit chars in SRC (in multibyte form) to the
821 corresponding byte and store in DST. CHARS is the number of
822 characters in SRC. The value is the number of bytes stored in DST.
823 Usually, the value is the same as CHARS, but is less than it if SRC
824 contains a non-ASCII, non-eight-bit character. If ACCEPT_LATIN_1
825 is nonzero, a Latin-1 character is accepted and converted to a byte
826 of that character code.
827 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
828
829 EMACS_INT
830 str_to_unibyte (src, dst, chars, accept_latin_1)
831 const unsigned char *src;
832 unsigned char *dst;
833 EMACS_INT chars;
834 int accept_latin_1;
835 {
836 EMACS_INT i;
837
838 for (i = 0; i < chars; i++)
839 {
840 int c = STRING_CHAR_ADVANCE (src);
841
842 if (CHAR_BYTE8_P (c))
843 c = CHAR_TO_BYTE8 (c);
844 else if (! ASCII_CHAR_P (c)
845 && (! accept_latin_1 || c >= 0x100))
846 return i;
847 *dst++ = c;
848 }
849 return i;
850 }
851
852
853 int
854 string_count_byte8 (string)
855 Lisp_Object string;
856 {
857 int multibyte = STRING_MULTIBYTE (string);
858 int nbytes = SBYTES (string);
859 unsigned char *p = SDATA (string);
860 unsigned char *pend = p + nbytes;
861 int count = 0;
862 int c, len;
863
864 if (multibyte)
865 while (p < pend)
866 {
867 c = *p;
868 len = BYTES_BY_CHAR_HEAD (c);
869
870 if (CHAR_BYTE8_HEAD_P (c))
871 count++;
872 p += len;
873 }
874 else
875 while (p < pend)
876 {
877 if (*p++ >= 0x80)
878 count++;
879 }
880 return count;
881 }
882
883
884 Lisp_Object
885 string_escape_byte8 (string)
886 Lisp_Object string;
887 {
888 int nchars = SCHARS (string);
889 int nbytes = SBYTES (string);
890 int multibyte = STRING_MULTIBYTE (string);
891 int byte8_count;
892 const unsigned char *src, *src_end;
893 unsigned char *dst;
894 Lisp_Object val;
895 int c, len;
896
897 if (multibyte && nchars == nbytes)
898 return string;
899
900 byte8_count = string_count_byte8 (string);
901
902 if (byte8_count == 0)
903 return string;
904
905 if (multibyte)
906 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
907 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
908 nbytes + byte8_count * 2);
909 else
910 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
911 val = make_uninit_string (nbytes + byte8_count * 3);
912
913 src = SDATA (string);
914 src_end = src + nbytes;
915 dst = SDATA (val);
916 if (multibyte)
917 while (src < src_end)
918 {
919 c = *src;
920 len = BYTES_BY_CHAR_HEAD (c);
921
922 if (CHAR_BYTE8_HEAD_P (c))
923 {
924 c = STRING_CHAR_ADVANCE (src);
925 c = CHAR_TO_BYTE8 (c);
926 sprintf ((char *) dst, "\\%03o", c);
927 dst += 4;
928 }
929 else
930 while (len--) *dst++ = *src++;
931 }
932 else
933 while (src < src_end)
934 {
935 c = *src++;
936 if (c >= 0x80)
937 {
938 sprintf ((char *) dst, "\\%03o", c);
939 dst += 4;
940 }
941 else
942 *dst++ = c;
943 }
944 return val;
945 }
946
947 \f
948 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
949 doc: /*
950 Concatenate all the argument characters and make the result a string.
951 usage: (string &rest CHARACTERS) */)
952 (n, args)
953 int n;
954 Lisp_Object *args;
955 {
956 int i, c;
957 unsigned char *buf, *p;
958 Lisp_Object str;
959 USE_SAFE_ALLOCA;
960
961 SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
962 p = buf;
963
964 for (i = 0; i < n; i++)
965 {
966 CHECK_CHARACTER (args[i]);
967 c = XINT (args[i]);
968 p += CHAR_STRING (c, p);
969 }
970
971 str = make_string_from_bytes ((char *) buf, n, p - buf);
972 SAFE_FREE ();
973 return str;
974 }
975
976 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
977 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
978 usage: (unibyte-string &rest BYTES) */)
979 (n, args)
980 int n;
981 Lisp_Object *args;
982 {
983 int i, c;
984 unsigned char *buf, *p;
985 Lisp_Object str;
986 USE_SAFE_ALLOCA;
987
988 SAFE_ALLOCA (buf, unsigned char *, n);
989 p = buf;
990
991 for (i = 0; i < n; i++)
992 {
993 CHECK_NATNUM (args[i]);
994 c = XFASTINT (args[i]);
995 if (c >= 256)
996 args_out_of_range_3 (args[i], make_number (0), make_number (255));
997 *p++ = c;
998 }
999
1000 str = make_string_from_bytes ((char *) buf, n, p - buf);
1001 SAFE_FREE ();
1002 return str;
1003 }
1004
1005 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
1006 Schar_resolve_modifiers, 1, 1, 0,
1007 doc: /* Resolve modifiers in the character CHAR.
1008 The value is a character with modifiers resolved into the character
1009 code. Unresolved modifiers are kept in the value.
1010 usage: (char-resolve-modifiers CHAR) */)
1011 (character)
1012 Lisp_Object character;
1013 {
1014 int c;
1015
1016 CHECK_NUMBER (character);
1017 c = XINT (character);
1018 return make_number (char_resolve_modifier_mask (c));
1019 }
1020
1021 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1022 doc: /* Return a byte value of a character at point.
1023 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1024 a byte value.
1025 Optional 2nd arg STRING, if non-nil, is a string of which first
1026 character is a target to get a byte value. In this case, POSITION, if
1027 non-nil, is an index of a target character in the string.
1028
1029 If the current buffer (or STRING) is multibyte, and the target
1030 character is not ASCII nor 8-bit character, an error is signalled. */)
1031 (position, string)
1032 Lisp_Object position, string;
1033 {
1034 int c;
1035 EMACS_INT pos;
1036 unsigned char *p;
1037
1038 if (NILP (string))
1039 {
1040 if (NILP (position))
1041 {
1042 p = PT_ADDR;
1043 }
1044 else
1045 {
1046 CHECK_NUMBER_COERCE_MARKER (position);
1047 if (XINT (position) < BEGV || XINT (position) >= ZV)
1048 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1049 pos = XFASTINT (position);
1050 p = CHAR_POS_ADDR (pos);
1051 }
1052 if (NILP (current_buffer->enable_multibyte_characters))
1053 return make_number (*p);
1054 }
1055 else
1056 {
1057 CHECK_STRING (string);
1058 if (NILP (position))
1059 {
1060 p = SDATA (string);
1061 }
1062 else
1063 {
1064 CHECK_NATNUM (position);
1065 if (XINT (position) >= SCHARS (string))
1066 args_out_of_range (string, position);
1067 pos = XFASTINT (position);
1068 p = SDATA (string) + string_char_to_byte (string, pos);
1069 }
1070 if (! STRING_MULTIBYTE (string))
1071 return make_number (*p);
1072 }
1073 c = STRING_CHAR (p);
1074 if (CHAR_BYTE8_P (c))
1075 c = CHAR_TO_BYTE8 (c);
1076 else if (! ASCII_CHAR_P (c))
1077 error ("Not an ASCII nor an 8-bit character: %d", c);
1078 return make_number (c);
1079 }
1080
1081
1082 void
1083 init_character_once ()
1084 {
1085 }
1086
1087 #ifdef emacs
1088
1089 void
1090 syms_of_character ()
1091 {
1092 DEFSYM (Qcharacterp, "characterp");
1093 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1094
1095 staticpro (&Vchar_unify_table);
1096 Vchar_unify_table = Qnil;
1097
1098 defsubr (&Smax_char);
1099 defsubr (&Scharacterp);
1100 defsubr (&Sunibyte_char_to_multibyte);
1101 defsubr (&Smultibyte_char_to_unibyte);
1102 defsubr (&Schar_bytes);
1103 defsubr (&Schar_width);
1104 defsubr (&Sstring_width);
1105 defsubr (&Schar_direction);
1106 defsubr (&Sstring);
1107 defsubr (&Sunibyte_string);
1108 defsubr (&Schar_resolve_modifiers);
1109 defsubr (&Sget_byte);
1110
1111 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1112 doc: /*
1113 Vector recording all translation tables ever defined.
1114 Each element is a pair (SYMBOL . TABLE) relating the table to the
1115 symbol naming it. The ID of a translation table is an index into this vector. */);
1116 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1117
1118 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1119 doc: /*
1120 A char-table for characters which invoke auto-filling.
1121 Such characters have value t in this table. */);
1122 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1123 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1124 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1125
1126 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1127 doc: /*
1128 A char-table for width (columns) of each character. */);
1129 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1130 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1131 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1132 make_number (4));
1133
1134 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1135 doc: /* A char-table for direction of each character. */);
1136 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1137
1138 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1139 doc: /* A char-table for each printable character. */);
1140 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1141 Fset_char_table_range (Vprintable_chars,
1142 Fcons (make_number (32), make_number (126)), Qt);
1143 Fset_char_table_range (Vprintable_chars,
1144 Fcons (make_number (160),
1145 make_number (MAX_5_BYTE_CHAR)), Qt);
1146
1147 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1148 doc: /* Char table of script symbols.
1149 It has one extra slot whose value is a list of script symbols. */);
1150
1151 /* Intern this now in case it isn't already done.
1152 Setting this variable twice is harmless.
1153 But don't staticpro it here--that is done in alloc.c. */
1154 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1155 DEFSYM (Qchar_script_table, "char-script-table");
1156 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1157 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1158
1159 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1160 doc: /* Alist of scripts vs the representative characters.
1161 Each element is a cons (SCRIPT . CHARS).
1162 SCRIPT is a symbol representing a script or a subgroup of a script.
1163 CHARS is a list or a vector of characters.
1164 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1165 If it is a vector, one of the characters in the vector is necessary.
1166 This variable is used to find a font for a specific script. */);
1167 Vscript_representative_chars = Qnil;
1168
1169 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1170 doc: /* Char table of Unicode's "General Category".
1171 All Unicode characters have one of the following values (symbol):
1172 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1173 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1174 See The Unicode Standard for the meaning of those values. */);
1175 /* The correct char-table is setup in characters.el. */
1176 Vunicode_category_table = Qnil;
1177 }
1178
1179 #endif /* emacs */
1180
1181 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1182 (do not change this comment) */