]> code.delx.au - gnu-emacs/blobdiff - src/regex.c
(Fx_create_frame): Add debugging code.
[gnu-emacs] / src / regex.c
index 0c1343bf58431143d8f1e3e8df7a1abaa794d858..4f2683adfb973bcf9a2b595a49efd667434ca743 100644 (file)
@@ -2,7 +2,8 @@
    0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
    internationalization features.)
 
-   Copyright (C) 1993,94,95,96,97,98,99,2000,04  Free Software Foundation, Inc.
+   Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+                 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,7 +17,7 @@
 
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
    USA.         */
 
 /* TODO:
@@ -1096,13 +1097,14 @@ print_partial_compiled_pattern (start, end)
 
        case wordend:
          fprintf (stderr, "/wordend");
+         break;
 
        case symbeg:
-         printf ("/symbeg");
+         fprintf (stderr, "/symbeg");
          break;
 
        case symend:
-         printf ("/symend");
+         fprintf (stderr, "/symend");
          break;
 
        case syntaxspec:
@@ -1249,7 +1251,7 @@ reg_syntax_t re_syntax_options;
 
 reg_syntax_t
 re_set_syntax (syntax)
-    reg_syntax_t syntax;
+     reg_syntax_t syntax;
 {
   reg_syntax_t ret = re_syntax_options;
 
@@ -1257,6 +1259,17 @@ re_set_syntax (syntax)
   return ret;
 }
 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
+
+/* Regexp to use to replace spaces, or NULL meaning don't.  */
+static re_char *whitespace_regexp;
+
+void
+re_set_whitespace_regexp (regexp)
+     const char *regexp;
+{
+  whitespace_regexp = (re_char *) regexp;
+}
+WEAK_ALIAS (__re_set_syntax, re_set_syntax)
 \f
 /* This table gives an error message for each of the error codes listed
    in regex.h.  Obviously the order here has to be same as there.
@@ -1282,6 +1295,7 @@ static const char *re_error_msgid[] =
     gettext_noop ("Premature end of regular expression"), /* REG_EEND */
     gettext_noop ("Regular expression too big"), /* REG_ESIZE */
     gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
+    gettext_noop ("Range striding over charsets") /* REG_ERANGEX  */
   };
 \f
 /* Avoiding alloca during matching, to placate r_alloc.  */
@@ -1937,64 +1951,32 @@ struct range_table_work_area
 
 /* Get the next unsigned number in the uncompiled pattern.  */
 #define GET_UNSIGNED_NUMBER(num)                                       \
- do { if (p != pend)                                                   \
-     {                                                                 \
-       PATFETCH (c);                                                   \
-       if (c == ' ')                                                   \
-        FREE_STACK_RETURN (REG_BADBR);                                 \
-       while ('0' <= c && c <= '9')                                    \
-        {                                                              \
-           int prev;                                                   \
-          if (num < 0)                                                 \
-            num = 0;                                                   \
-          prev = num;                                                  \
-          num = num * 10 + c - '0';                                    \
-          if (num / 10 != prev)                                        \
-            FREE_STACK_RETURN (REG_BADBR);                             \
-          if (p == pend)                                               \
-            break;                                                     \
-          PATFETCH (c);                                                \
-        }                                                              \
-       if (c == ' ')                                                   \
-        FREE_STACK_RETURN (REG_BADBR);                                 \
-       }                                                               \
-    } while (0)
+  do {                                                                 \
+    if (p == pend)                                                     \
+      FREE_STACK_RETURN (REG_EBRACE);                                  \
+    else                                                               \
+      {                                                                        \
+       PATFETCH (c);                                                   \
+       while ('0' <= c && c <= '9')                                    \
+         {                                                             \
+           int prev;                                                   \
+           if (num < 0)                                                \
+             num = 0;                                                  \
+           prev = num;                                                 \
+           num = num * 10 + c - '0';                                   \
+           if (num / 10 != prev)                                       \
+             FREE_STACK_RETURN (REG_BADBR);                            \
+           if (p == pend)                                              \
+             FREE_STACK_RETURN (REG_EBRACE);                           \
+           PATFETCH (c);                                               \
+         }                                                             \
+      }                                                                        \
+  } while (0)
 \f
-#if WIDE_CHAR_SUPPORT
-/* The GNU C library provides support for user-defined character classes
-   and the functions from ISO C amendement 1.  */
-# ifdef CHARCLASS_NAME_MAX
-#  define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
-# else
-/* This shouldn't happen but some implementation might still have this
-   problem.  Use a reasonable default value.  */
-#  define CHAR_CLASS_MAX_LENGTH 256
-# endif
-typedef wctype_t re_wctype_t;
-typedef wchar_t re_wchar_t;
-# define re_wctype wctype
-# define re_iswctype iswctype
-# define re_wctype_to_bit(cc) 0
-#else
-# define CHAR_CLASS_MAX_LENGTH  9 /* Namely, `multibyte'.  */
-# define btowc(c) c
-
-/* Character classes.  */
-typedef enum { RECC_ERROR = 0,
-              RECC_ALNUM, RECC_ALPHA, RECC_WORD,
-              RECC_GRAPH, RECC_PRINT,
-              RECC_LOWER, RECC_UPPER,
-              RECC_PUNCT, RECC_CNTRL,
-              RECC_DIGIT, RECC_XDIGIT,
-              RECC_BLANK, RECC_SPACE,
-              RECC_MULTIBYTE, RECC_NONASCII,
-              RECC_ASCII, RECC_UNIBYTE
-} re_wctype_t;
-
-typedef int re_wchar_t;
+#if ! WIDE_CHAR_SUPPORT
 
 /* Map a string to the char class it names (if any).  */
-static re_wctype_t
+re_wctype_t
 re_wctype (str)
      re_char *str;
 {
@@ -2020,7 +2002,7 @@ re_wctype (str)
 }
 
 /* True iff CH is in the char class CC.  */
-static boolean
+boolean
 re_iswctype (ch, cc)
      int ch;
      re_wctype_t cc;
@@ -2465,6 +2447,15 @@ regex_compile (pattern, size, syntax, bufp)
   /* If the object matched can contain multibyte characters.  */
   const boolean multibyte = RE_MULTIBYTE_P (bufp);
 
+  /* Nonzero if we have pushed down into a subpattern.  */
+  int in_subpattern = 0;
+
+  /* These hold the values of p, pattern, and pend from the main
+     pattern when we have pushed into a subpattern.  */
+  re_char *main_p;
+  re_char *main_pattern;
+  re_char *main_pend;
+
 #ifdef DEBUG
   debug++;
   DEBUG_PRINT1 ("\nCompiling pattern: ");
@@ -2527,12 +2518,61 @@ regex_compile (pattern, size, syntax, bufp)
   begalt = b = bufp->buffer;
 
   /* Loop through the uncompiled pattern until we're at the end.  */
-  while (p != pend)
+  while (1)
     {
+      if (p == pend)
+       {
+         /* If this is the end of an included regexp,
+            pop back to the main regexp and try again.  */
+         if (in_subpattern)
+           {
+             in_subpattern = 0;
+             pattern = main_pattern;
+             p = main_p;
+             pend = main_pend;
+             continue;
+           }
+         /* If this is the end of the main regexp, we are done.  */
+         break;
+       }
+
       PATFETCH (c);
 
       switch (c)
        {
+       case ' ':
+         {
+           re_char *p1 = p;
+
+           /* If there's no special whitespace regexp, treat
+              spaces normally.  And don't try to do this recursively.  */
+           if (!whitespace_regexp || in_subpattern)
+             goto normal_char;
+
+           /* Peek past following spaces.  */
+           while (p1 != pend)
+             {
+               if (*p1 != ' ')
+                 break;
+               p1++;
+             }
+           /* If the spaces are followed by a repetition op,
+              treat them normally.  */
+           if (p1 != pend
+               && (*p1 == '*' || *p1 == '+' || *p1 == '?'
+                   || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
+             goto normal_char;
+
+           /* Replace the spaces with the whitespace regexp.  */
+           in_subpattern = 1;
+           main_p = p1;
+           main_pend = pend;
+           main_pattern = pattern;
+           p = pattern = whitespace_regexp;
+           pend = p + strlen (p);
+           break;
+         }    
+
        case '^':
          {
            if (   /* If at start of pattern, it's an operator.  */
@@ -2901,7 +2941,7 @@ regex_compile (pattern, size, syntax, bufp)
                          }
                      }
                    else if (!SAME_CHARSET_P (c, c1))
-                     FREE_STACK_RETURN (REG_ERANGE);
+                     FREE_STACK_RETURN (REG_ERANGEX);
                  }
                else
                  /* Range from C to C. */
@@ -3194,9 +3234,6 @@ regex_compile (pattern, size, syntax, bufp)
 
                beg_interval = p;
 
-               if (p == pend)
-                 FREE_STACK_RETURN (REG_EBRACE);
-
                GET_UNSIGNED_NUMBER (lower_bound);
 
                if (c == ',')
@@ -3213,7 +3250,8 @@ regex_compile (pattern, size, syntax, bufp)
                  {
                    if (c != '\\')
                      FREE_STACK_RETURN (REG_BADBR);
-
+                   if (p == pend)
+                     FREE_STACK_RETURN (REG_EESCAPE);
                    PATFETCH (c);
                  }
 
@@ -4253,12 +4291,19 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
                        d += buf_charlen;
                      }
                  else
-                   while (range > lim
-                          && !fastmap[RE_TRANSLATE (translate, *d)])
-                     {
-                       d++;
-                       range--;
-                     }
+                   {
+                     /* Convert *d to integer to shut up GCC's
+                        whining about comparison that is always
+                        true.  */
+                     int di = *d;
+
+                     while (range > lim
+                            && !fastmap[RE_TRANSLATE (translate, di)])
+                       {
+                         di = *(++d);
+                         range--;
+                       }
+                   }
                }
              else
                while (range > lim && !fastmap[*d])
@@ -5207,8 +5252,13 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
              else
                do
                  {
+                   /* Avoid compiler whining about comparison being
+                      always true.  */
+                   int di;
+
                    PREFETCH ();
-                   if (RE_TRANSLATE (translate, *d) != *p++)
+                   di = *d;
+                   if (RE_TRANSLATE (translate, di) != *p++)
                      {
                        d = dfail;
                        goto fail;
@@ -5823,7 +5873,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
                  PREFETCH_NOLIMIT ();
                  c2 = RE_STRING_CHAR (d, dend - d);
 #ifdef emacs
-                 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
 #endif
                  s2 = SYNTAX (c2);
 
@@ -5910,7 +5960,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
                  PREFETCH_NOLIMIT ();
                  c2 = RE_STRING_CHAR (d, dend - d);
 #ifdef emacs
-                 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
 #endif
                  s2 = SYNTAX (c2);