X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/d85d38392e338f66053a6a6f1017720660239338..76b6f7075970e492eba3cf3f4411fcfc4ff3bdcd:/src/character.c

diff --git a/src/character.c b/src/character.c
index 29a7f80ae4..119502a978 100644
--- a/src/character.c
+++ b/src/character.c
@@ -1,17 +1,18 @@
 /* Basic character support.
    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
      Licensed to the Free Software Foundation.
-   Copyright (C) 2001, 2005, 2006 Free Software Foundation, Inc.
-   Copyright (C) 2003, 2006
+   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+     Free Software Foundation, Inc.
+   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
      National Institute of Advanced Industrial Science and Technology (AIST)
      Registration Number H13PRO009
 
 This file is part of GNU Emacs.
 
-GNU Emacs is free software; you can redistribute it and/or modify
+GNU Emacs is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
 
 GNU Emacs is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -19,9 +20,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
-along with GNU Emacs; see the file COPYING.  If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
+along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 
 /* At first, see the document in `character.h' to understand the code
    in this file.  */
@@ -60,7 +59,7 @@ Lisp_Object Vauto_fill_chars;
 Lisp_Object Qauto_fill_chars;
 
 /* Char-table of information about which character to unify to which
-   Unicode character.  */
+   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
 Lisp_Object Vchar_unify_table;
 
 /* A char-table.  An element is non-nil iff the corresponding
@@ -86,6 +85,8 @@ Lisp_Object Vscript_representative_chars;
 
 static Lisp_Object Qchar_script_table;
 
+Lisp_Object Vunicode_category_table;
+
 /* Mapping table from unibyte chars to multibyte chars.  */
 int unibyte_to_multibyte_table[256];
 
@@ -95,53 +96,67 @@ char unibyte_has_multibyte_table[256];
 
 
 
+/* If character code C has modifier masks, reflect them to the
+   character code if possible.  Return the resulting code.  */
+
+int
+char_resolve_modifier_mask (c)
+     int c;
+{
+  /* A non-ASCII character can't reflect modifier bits to the code.  */
+  if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
+    return c;
+
+  /* For Meta, Shift, and Control modifiers, we need special care.  */
+  if (c & CHAR_SHIFT)
+    {
+      /* Shift modifier is valid only with [A-Za-z].  */
+      if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
+	c &= ~CHAR_SHIFT;
+      else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
+	c = (c & ~CHAR_SHIFT) - ('a' - 'A');
+      /* Shift modifier for control characters and SPC is ignored.  */
+      else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
+	c &= ~CHAR_SHIFT;
+    }
+  if (c & CHAR_CTL)
+    {
+      /* Simulate the code in lread.c.  */
+      /* Allow `\C- ' and `\C-?'.  */
+      if ((c & 0377) == ' ')
+	c &= ~0177 & ~ CHAR_CTL;
+      else if ((c & 0377) == '?')
+	c = 0177 | (c & ~0177 & ~CHAR_CTL);
+      /* ASCII control chars are made from letters (both cases),
+	 as well as the non-letters within 0100...0137.  */
+      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
+	c &= (037 | (~0177 & ~CHAR_CTL));
+      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
+	c &= (037 | (~0177 & ~CHAR_CTL));
+    }
+  if (c & CHAR_META)
+    {
+      /* Move the meta bit to the right place for a string.  */
+      c = (c & ~CHAR_META) | 0x80;
+    }
+
+  return c;
+}
+
+
 /* Store multibyte form of character C at P.  If C has modifier bits,
    handle them appropriately.  */
 
 int
 char_string (c, p)
-     int c;
+     unsigned c;
      unsigned char *p;
 {
   int bytes;
 
   if (c & CHAR_MODIFIER_MASK)
     {
-      /* As an non-ASCII character can't have modifier bits, we just
-	 ignore the bits.  */
-      if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
-	{
-	  /* For Meta, Shift, and Control modifiers, we need special care.  */
-	  if (c & CHAR_META)
-	    {
-	      /* Move the meta bit to the right place for a string.  */
-	      c = (c & ~CHAR_META) | 0x80;
-	    }
-	  if (c & CHAR_SHIFT)
-	    {
-	      /* Shift modifier is valid only with [A-Za-z].  */
-	      if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
-		c &= ~CHAR_SHIFT;
-	      else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
-		c = (c & ~CHAR_SHIFT) - ('a' - 'A');
-	    }
-	  if (c & CHAR_CTL)
-	    {
-	      /* Simulate the code in lread.c.  */
-	      /* Allow `\C- ' and `\C-?'.  */
-	      if (c == (CHAR_CTL | ' '))
-		c = 0;
-	      else if (c == (CHAR_CTL | '?'))
-		c = 127;
-	      /* ASCII control chars are made from letters (both cases),
-		 as well as the non-letters within 0100...0137.  */
-	      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
-		c &= (037 | (~0177 & ~CHAR_CTL));
-	      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
-		c &= (037 | (~0177 & ~CHAR_CTL));
-	    }
-	}
-
+      c = (unsigned) char_resolve_modifier_mask ((int) c);
       /* If C still has any modifier bits, just ignore it.  */
       c &= ~CHAR_MODIFIER_MASK;
     }
@@ -169,11 +184,13 @@ char_string (c, p)
       p[4] = (0x80 | (c & 0x3F));
       bytes = 5;
     }
-  else
+  else if (c <= MAX_CHAR)
     {
       c = CHAR_TO_BYTE8 (c);
       bytes = BYTE8_STRING (c, p);
     }
+  else
+    error ("Invalid character: %d", c);
 
   return bytes;
 }
@@ -309,9 +326,7 @@ DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 
 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
        Sunibyte_char_to_multibyte, 1, 1, 0,
-       doc: /* Convert the unibyte character CH to multibyte character.
-The multibyte character is a result of decoding CH by
-the current unibyte charset (see `unibyte-charset').  */)
+       doc: /* Convert the byte CH to multibyte character.  */)
      (ch)
      Lisp_Object ch;
 {
@@ -331,23 +346,30 @@ the current unibyte charset (see `unibyte-charset').  */)
 
 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
        Smultibyte_char_to_unibyte, 1, 1, 0,
-       doc: /* Convert the multibyte character CH to unibyte character.\n\
-The unibyte character is a result of encoding CH by
-the current primary charset (value of `charset-primary').  */)
+       doc: /* Convert the multibyte character CH to a byte.
+If the multibyte character does not represent a byte, return -1.  */)
      (ch)
      Lisp_Object ch;
 {
-  int c;
+  int cm;
 
   CHECK_CHARACTER (ch);
-  c = XFASTINT (ch);
-  c = CHAR_TO_BYTE8 (c);
-  return make_number (c);
+  cm = XFASTINT (ch);
+  if (cm < 256)
+    /* Can't distinguish a byte read from a unibyte buffer from
+       a latin1 char, so let's let it slide.  */
+    return ch;
+  else
+    {
+      int cu = CHAR_TO_BYTE_SAFE (cm);
+      return make_number (cu);
+    }
 }
 
 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
        doc: /* Return 1 regardless of the argument CHAR.
-This is now an obsolete function.  We keep it just for backward compatibility.	 */)
+This is now an obsolete function.  We keep it just for backward compatibility.
+usage: (char-bytes CHAR)  */)
      (ch)
      Lisp_Object ch;
 {
@@ -358,7 +380,8 @@ This is now an obsolete function.  We keep it just for backward compatibility.
 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
        doc: /* Return width of CHAR when displayed in the current buffer.
 The width is measured by how many columns it occupies on the screen.
-Tab is taken to occupy `tab-width' columns.  */)
+Tab is taken to occupy `tab-width' columns.
+usage: (char-width CHAR)  */)
      (ch)
        Lisp_Object ch;
 {
@@ -388,9 +411,7 @@ Tab is taken to occupy `tab-width' columns.  */)
    respectively.  */
 
 int
-c_string_width (str, len, precision, nchars, nbytes)
-     const unsigned char *str;
-     int precision, *nchars, *nbytes;
+c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 {
   int i = 0, i_byte = 0;
   int width = 0;
@@ -535,7 +556,8 @@ Width is measured by how many columns it occupies on the screen.
 When calculating width of a multibyte character in STRING,
 only the base leading-code is considered; the validity of
 the following bytes is not checked.  Tabs in STRING are always
-taken to occupy `tab-width' columns.  */)
+taken to occupy `tab-width' columns.
+usage: (string-width STRING)  */)
      (str)
      Lisp_Object str;
 {
@@ -548,7 +570,8 @@ taken to occupy `tab-width' columns.  */)
 
 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
        doc: /* Return the direction of CHAR.
-The returned value is 0 for left-to-right and 1 for right-to-left.  */)
+The returned value is 0 for left-to-right and 1 for right-to-left.
+usage: (char-direction CHAR)  */)
      (ch)
      Lisp_Object ch;
 {
@@ -565,10 +588,10 @@ The returned value is 0 for left-to-right and 1 for right-to-left.  */)
    However, if the current buffer has enable-multibyte-characters =
    nil, we treat each byte as a character.  */
 
-int
+EMACS_INT
 chars_in_text (ptr, nbytes)
      const unsigned char *ptr;
-     int nbytes;
+     EMACS_INT nbytes;
 {
   /* current_buffer is null at early stages of Emacs initialization.  */
   if (current_buffer == 0
@@ -583,10 +606,10 @@ chars_in_text (ptr, nbytes)
    sequences while assuming that there's no invalid sequence.  It
    ignores enable-multibyte-characters.  */
 
-int
+EMACS_INT
 multibyte_chars_in_text (ptr, nbytes)
      const unsigned char *ptr;
-     int nbytes;
+     EMACS_INT nbytes;
 {
   const unsigned char *endp = ptr + nbytes;
   int chars = 0;
@@ -811,6 +834,39 @@ str_as_unibyte (str, bytes)
   return (to - str);
 }
 
+/* Convert eight-bit chars in SRC (in multibyte form) to the
+   corresponding byte and store in DST.  CHARS is the number of
+   characters in SRC.  The value is the number of bytes stored in DST.
+   Usually, the value is the same as CHARS, but is less than it if SRC
+   contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
+   is nonzero, a Latin-1 character is accepted and converted to a byte
+   of that character code.
+   Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
+
+EMACS_INT
+str_to_unibyte (src, dst, chars, accept_latin_1)
+     const unsigned char *src;
+     unsigned char *dst;
+     EMACS_INT chars;
+     int accept_latin_1;
+{
+  EMACS_INT i;
+
+  for (i = 0; i < chars; i++)
+    {
+      int c = STRING_CHAR_ADVANCE (src);
+
+      if (CHAR_BYTE8_P (c))
+	c = CHAR_TO_BYTE8 (c);
+      else if (! ASCII_CHAR_P (c)
+	       && (! accept_latin_1 || c >= 0x100))
+	return i;
+      *dst++ = c;
+    }
+  return i;
+}
+
+
 int
 string_count_byte8 (string)
      Lisp_Object string;
@@ -929,6 +985,107 @@ usage: (string &rest CHARACTERS)  */)
   return make_string_from_bytes ((char *) buf, n, p - buf);
 }
 
+DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
+       doc: /* Concatenate all the argument bytes and make the result a unibyte string.
+usage: (unibyte-string &rest BYTES)  */)
+     (n, args)
+     int n;
+     Lisp_Object *args;
+{
+  int i;
+  unsigned char *buf = (unsigned char *) alloca (n);
+  unsigned char *p = buf;
+  unsigned c;
+
+  for (i = 0; i < n; i++)
+    {
+      CHECK_NATNUM (args[i]);
+      c = XFASTINT (args[i]);
+      if (c >= 256)
+	args_out_of_range_3 (args[i], make_number (0), make_number (255));
+      *p++ = c;
+    }
+
+  return make_string_from_bytes ((char *) buf, n, p - buf);
+}
+
+DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
+       Schar_resolve_modifiers, 1, 1, 0,
+       doc: /* Resolve modifiers in the character CHAR.
+The value is a character with modifiers resolved into the character
+code.  Unresolved modifiers are kept in the value.
+usage: (char-resolve-modifiers CHAR)  */)
+     (character)
+     Lisp_Object character;
+{
+  int c;
+
+  CHECK_NUMBER (character);
+  c = XINT (character);
+  return make_number (char_resolve_modifier_mask (c));
+}
+
+DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
+       doc: /* Return a byte value of a character at point.
+Optional 1st arg POSITION, if non-nil, is a position of a character to get
+a byte value.
+Optional 2nd arg STRING, if non-nil, is a string of which first
+character is a target to get a byte value.  In this case, POSITION, if
+non-nil, is an index of a target character in the string.
+
+If the current buffer (or STRING) is multibyte, and the target
+character is not ASCII nor 8-bit character, an error is signalled.  */)
+     (position, string)
+     Lisp_Object position, string;
+{
+  int c;
+  EMACS_INT pos;
+  unsigned char *p;
+
+  if (NILP (string))
+    {
+      if (NILP (position))
+	{
+	  p = PT_ADDR;
+	}	  
+      else
+	{
+	  CHECK_NUMBER_COERCE_MARKER (position);
+	  if (XINT (position) < BEGV || XINT (position) >= ZV)
+	    args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
+	  pos = XFASTINT (position);
+	  p = CHAR_POS_ADDR (pos);
+	}
+      if (NILP (current_buffer->enable_multibyte_characters))
+	return make_number (*p);
+    }
+  else
+    {
+      CHECK_STRING (string);
+      if (NILP (position))
+	{
+	  p = SDATA (string);
+	}
+      else
+	{
+	  CHECK_NATNUM (position);
+	  if (XINT (position) >= SCHARS (string))
+	    args_out_of_range (string, position);
+	  pos = XFASTINT (position);
+	  p = SDATA (string) + string_char_to_byte (string, pos);
+	}
+      if (! STRING_MULTIBYTE (string))
+	return make_number (*p);
+    }
+  c = STRING_CHAR (p, 0);
+  if (CHAR_BYTE8_P (c))
+    c = CHAR_TO_BYTE8 (c);
+  else if (! ASCII_CHAR_P (c))
+    error ("Not an ASCII nor an 8-bit character: %d", c);
+  return make_number (c);
+}
+
+
 void
 init_character_once ()
 {
@@ -954,6 +1111,9 @@ syms_of_character ()
   defsubr (&Sstring_width);
   defsubr (&Schar_direction);
   defsubr (&Sstring);
+  defsubr (&Sunibyte_string);
+  defsubr (&Schar_resolve_modifiers);
+  defsubr (&Sget_byte);
 
   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 	       doc: /*
@@ -1004,8 +1164,22 @@ It has one extra slot whose value is a list of script symbols.  */);
   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
 
   DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
-	       doc: /* Alist of scripts vs the representative characters.  */);
+	       doc: /* Alist of scripts vs the representative characters.
+Each element is a cons (SCRIPT . CHARS), where SCRIPT is a script name symbol,
+CHARS is a list or a vector of characters.
+If it is a list, all characters in the list are necessary for supporting SCRIPT.
+If it is a vector, one of the characters in the vector is necessary.
+This variable is used to find a font for a specific script.  */);
   Vscript_representative_chars = Qnil;
+
+  DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
+	       doc: /* Char table of Unicode's "General Category".
+All Unicode characters have one of the following values (symbol):
+  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
+  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
+See The Unicode Standard for the meaning of those values.  */);
+  /* The correct char-table is setup in characters.el.  */
+  Vunicode_category_table = Qnil;
 }
 
 #endif /* emacs */