(BYTES_USED): Use uordblks, not arena.

[gnu-emacs] / src / regex.c
diff --git a/src/regex.c b/src/regex.c

index 43351b380de63a65d4f55bb7ca72524425d05093..4f2683adfb973bcf9a2b595a49efd667434ca743 100644 (file)
--- a/src/regex.c
+++ b/src/regex.c
@@ -2,7 +2,8 @@
     0.12.  (Implements POSIX draft P1003.2/D11.2, except for some of the
     internationalization features.)
  
-   Copyright (C) 1993,94,95,96,97,98,99,2000 Free Software Foundation, Inc.
+   Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+                 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
  
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -16,12 +17,10 @@
  
     You should have received a copy of the GNU General Public License
     along with this program; if not, write to the Free Software
-   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
     USA.         */
  
-/* BUGS:
-   - (x?)*y\1z should match both xxxxyxz and xxxyz.
-   TODO:
+/* TODO:
     - structure the opcode space into opcode+flag.
     - merge with glibc's regex.[ch].
     - replace (succeed_n + jump_n + set_number_at) with something that doesn't
@@ -35,9 +34,6 @@
    #pragma alloca
  #endif
  
-#undef _GNU_SOURCE
-#define _GNU_SOURCE
-
  #ifdef HAVE_CONFIG_H
  # include <config.h>
  #endif
@@ -162,8 +158,9 @@
         {                                                               \
          re_char *dtemp = (p) == (str2) ? (end1) : (p);                 \
          re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
-        while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp));             \
-        c = STRING_CHAR (dtemp, (p) - dtemp);                          \
+        re_char *d0 = dtemp;                                           \
+        PREV_CHAR_BOUNDARY (d0, dlimit);                               \
+        c = STRING_CHAR (d0, dtemp - d0);                              \
         }                                                               \
       else                                                              \
         (c = ((p) == (str2) ? (end1) : (p))[-1]);                       \
@@ -221,7 +218,7 @@ char *realloc ();
  /* Define the syntax stuff for \<, \>, etc.  */
  
  /* Sword must be nonzero for the wordchar pattern commands in re_match_2.  */
-enum syntaxcode { Swhitespace = 0, Sword = 1 };
+enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
  
  # ifdef SWITCH_ENUM_BUG
  #  define SWITCH_ENUM_CAST(x) ((int)(x))
@@ -240,6 +237,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1 };
  # define SINGLE_BYTE_CHAR_P(c) (1)
  # define SAME_CHARSET_P(c1, c2) (1)
  # define MULTIBYTE_FORM_LENGTH(p, s) (1)
+# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
  # define STRING_CHAR(p, s) (*(p))
  # define RE_STRING_CHAR STRING_CHAR
  # define CHAR_STRING(c, s) (*(s) = (c), 1)
@@ -401,7 +399,7 @@ init_syntax_once ()
       if (ISALNUM (c))
         re_syntax_table[c] = Sword;
  
-   re_syntax_table['_'] = Sword;
+   re_syntax_table['_'] = Ssymbol;
  
     done = 1;
  }
@@ -658,6 +656,9 @@ typedef enum
    wordbound,   /* Succeeds if at a word boundary.  */
    notwordbound,        /* Succeeds if not at a word boundary.  */
  
+  symbeg,       /* Succeeds if at symbol beginning.  */
+  symend,       /* Succeeds if at symbol end.  */
+
         /* Matches any character whose syntax is specified.  Followed by
            a byte which contains a syntax code, e.g., Sword.  */
    syntaxspec,
@@ -924,50 +925,49 @@ print_partial_compiled_pattern (start, end)
  
    if (start == NULL)
      {
-      printf ("(null)\n");
+      fprintf (stderr, "(null)\n");
        return;
      }
  
    /* Loop over pattern commands.  */
    while (p < pend)
      {
-      printf ("%d:\t", p - start);
+      fprintf (stderr, "%d:\t", p - start);
  
        switch ((re_opcode_t) *p++)
         {
         case no_op:
-         printf ("/no_op");
+         fprintf (stderr, "/no_op");
           break;
  
         case succeed:
-         printf ("/succeed");
+         fprintf (stderr, "/succeed");
           break;
  
         case exactn:
           mcnt = *p++;
-         printf ("/exactn/%d", mcnt);
+         fprintf (stderr, "/exactn/%d", mcnt);
           do
             {
-             putchar ('/');
-             putchar (*p++);
+             fprintf (stderr, "/%c", *p++);
             }
           while (--mcnt);
           break;
  
         case start_memory:
-         printf ("/start_memory/%d", *p++);
+         fprintf (stderr, "/start_memory/%d", *p++);
           break;
  
         case stop_memory:
-         printf ("/stop_memory/%d", *p++);
+         fprintf (stderr, "/stop_memory/%d", *p++);
           break;
  
         case duplicate:
-         printf ("/duplicate/%d", *p++);
+         fprintf (stderr, "/duplicate/%d", *p++);
           break;
  
         case anychar:
-         printf ("/anychar");
+         fprintf (stderr, "/anychar");
           break;
  
         case charset:
@@ -978,10 +978,11 @@ print_partial_compiled_pattern (start, end)
             int length = CHARSET_BITMAP_SIZE (p - 1);
             int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
  
-           printf ("/charset [%s",
-                   (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
+           fprintf (stderr, "/charset [%s",
+                    (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
  
-           assert (p + *p < pend);
+           if (p + *p >= pend)
+             fprintf (stderr, " !extends past end of pattern! ");
  
             for (c = 0; c < 256; c++)
               if (c / 8 < length
@@ -990,33 +991,33 @@ print_partial_compiled_pattern (start, end)
                   /* Are we starting a range?  */
                   if (last + 1 == c && ! in_range)
                     {
-                     putchar ('-');
+                     fprintf (stderr, "-");
                       in_range = 1;
                     }
                   /* Have we broken a range?  */
                   else if (last + 1 != c && in_range)
                     {
-                     putchar (last);
+                     fprintf (stderr, "%c", last);
                       in_range = 0;
                     }
  
                   if (! in_range)
-                   putchar (c);
+                   fprintf (stderr, "%c", c);
  
                   last = c;
               }
  
             if (in_range)
-             putchar (last);
+             fprintf (stderr, "%c", last);
  
-           putchar (']');
+           fprintf (stderr, "]");
  
             p += 1 + length;
  
             if (has_range_table)
               {
                 int count;
-               printf ("has-range-table");
+               fprintf (stderr, "has-range-table");
  
                 /* ??? Should print the range table; for now, just skip it.  */
                 p += 2;         /* skip range table bits */
@@ -1027,130 +1028,139 @@ print_partial_compiled_pattern (start, end)
           break;
  
         case begline:
-         printf ("/begline");
+         fprintf (stderr, "/begline");
           break;
  
         case endline:
-         printf ("/endline");
+         fprintf (stderr, "/endline");
           break;
  
         case on_failure_jump:
           extract_number_and_incr (&mcnt, &p);
-         printf ("/on_failure_jump to %d", p + mcnt - start);
+         fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
           break;
  
         case on_failure_keep_string_jump:
           extract_number_and_incr (&mcnt, &p);
-         printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
+         fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
           break;
  
         case on_failure_jump_nastyloop:
           extract_number_and_incr (&mcnt, &p);
-         printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
+         fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
           break;
  
         case on_failure_jump_loop:
           extract_number_and_incr (&mcnt, &p);
-         printf ("/on_failure_jump_loop to %d", p + mcnt - start);
+         fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
           break;
  
         case on_failure_jump_smart:
           extract_number_and_incr (&mcnt, &p);
-         printf ("/on_failure_jump_smart to %d", p + mcnt - start);
+         fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
           break;
  
         case jump:
           extract_number_and_incr (&mcnt, &p);
-         printf ("/jump to %d", p + mcnt - start);
+         fprintf (stderr, "/jump to %d", p + mcnt - start);
           break;
  
         case succeed_n:
           extract_number_and_incr (&mcnt, &p);
           extract_number_and_incr (&mcnt2, &p);
-         printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
+         fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
           break;
  
         case jump_n:
           extract_number_and_incr (&mcnt, &p);
           extract_number_and_incr (&mcnt2, &p);
-         printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
+         fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
           break;
  
         case set_number_at:
           extract_number_and_incr (&mcnt, &p);
           extract_number_and_incr (&mcnt2, &p);
-         printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
+         fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
           break;
  
         case wordbound:
-         printf ("/wordbound");
+         fprintf (stderr, "/wordbound");
           break;
  
         case notwordbound:
-         printf ("/notwordbound");
+         fprintf (stderr, "/notwordbound");
           break;
  
         case wordbeg:
-         printf ("/wordbeg");
+         fprintf (stderr, "/wordbeg");
           break;
  
         case wordend:
-         printf ("/wordend");
+         fprintf (stderr, "/wordend");
+         break;
+
+       case symbeg:
+         fprintf (stderr, "/symbeg");
+         break;
+
+       case symend:
+         fprintf (stderr, "/symend");
+         break;
  
         case syntaxspec:
-         printf ("/syntaxspec");
+         fprintf (stderr, "/syntaxspec");
           mcnt = *p++;
-         printf ("/%d", mcnt);
+         fprintf (stderr, "/%d", mcnt);
           break;
  
         case notsyntaxspec:
-         printf ("/notsyntaxspec");
+         fprintf (stderr, "/notsyntaxspec");
           mcnt = *p++;
-         printf ("/%d", mcnt);
+         fprintf (stderr, "/%d", mcnt);
           break;
  
  # ifdef emacs
         case before_dot:
-         printf ("/before_dot");
+         fprintf (stderr, "/before_dot");
           break;
  
         case at_dot:
-         printf ("/at_dot");
+         fprintf (stderr, "/at_dot");
           break;
  
         case after_dot:
-         printf ("/after_dot");
+         fprintf (stderr, "/after_dot");
           break;
  
         case categoryspec:
-         printf ("/categoryspec");
+         fprintf (stderr, "/categoryspec");
           mcnt = *p++;
-         printf ("/%d", mcnt);
+         fprintf (stderr, "/%d", mcnt);
           break;
  
         case notcategoryspec:
-         printf ("/notcategoryspec");
+         fprintf (stderr, "/notcategoryspec");
           mcnt = *p++;
-         printf ("/%d", mcnt);
+         fprintf (stderr, "/%d", mcnt);
           break;
  # endif /* emacs */
  
         case begbuf:
-         printf ("/begbuf");
+         fprintf (stderr, "/begbuf");
           break;
  
         case endbuf:
-         printf ("/endbuf");
+         fprintf (stderr, "/endbuf");
           break;
  
         default:
-         printf ("?%d", *(p-1));
+         fprintf (stderr, "?%d", *(p-1));
         }
  
-      putchar ('\n');
+      fprintf (stderr, "\n");
      }
  
-  printf ("%d:\tend of pattern.\n", p - start);
+  fprintf (stderr, "%d:\tend of pattern.\n", p - start);
  }
  
  
@@ -1241,7 +1251,7 @@ reg_syntax_t re_syntax_options;
  
  reg_syntax_t
  re_set_syntax (syntax)
-    reg_syntax_t syntax;
+     reg_syntax_t syntax;
  {
    reg_syntax_t ret = re_syntax_options;
  
@@ -1249,6 +1259,17 @@ re_set_syntax (syntax)
    return ret;
  }
  WEAK_ALIAS (__re_set_syntax, re_set_syntax)
+
+/* Regexp to use to replace spaces, or NULL meaning don't.  */
+static re_char *whitespace_regexp;
+
+void   /* Set the regexp substituted for spaces in patterns; NULL disables it.  */
+re_set_whitespace_regexp (regexp)
+     const char *regexp;   /* Stored as-is in whitespace_regexp, not copied.  */
+{
+  whitespace_regexp = (re_char *) regexp;
+}
+WEAK_ALIAS (__re_set_syntax, re_set_syntax)   /* NOTE(review): same alias as after re_set_syntax; looks like a copy-paste -- confirm.  */
  \f
  /* This table gives an error message for each of the error codes listed
     in regex.h.  Obviously the order here has to be same as there.
@@ -1274,6 +1295,7 @@ static const char *re_error_msgid[] =
      gettext_noop ("Premature end of regular expression"), /* REG_EEND */
      gettext_noop ("Regular expression too big"), /* REG_ESIZE */
      gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
+    gettext_noop ("Range striding over charsets") /* REG_ERANGEX  */
    };
  \f
  /* Avoiding alloca during matching, to placate r_alloc.  */
@@ -1520,26 +1542,6 @@ do {                                                                     \
      }                                                                  \
  } while (0)
  
-/* Discard a saved register off the stack.  */
-#define DISCARD_FAILURE_REG_OR_COUNT()                                 \
-do {                                                                   \
-  int reg = POP_FAILURE_INT ();                                                \
-  if (reg == -1)                                                       \
-    {                                                                  \
-      /* It's a counter.  */                                           \
-      POP_FAILURE_POINTER ();                                          \
-      reg = POP_FAILURE_INT ();                                                \
-      DEBUG_PRINT3 ("     Discard counter %p = %d\n", ptr, reg);       \
-    }                                                                  \
-  else                                                                 \
-    {                                                                  \
-      POP_FAILURE_POINTER ();                                          \
-      POP_FAILURE_POINTER ();                                          \
-      DEBUG_PRINT4 ("     Discard reg %d (spanning %p -> %p)\n",       \
-                   reg, regstart[reg], regend[reg]);                   \
-    }                                                                  \
-} while (0)
-
  /* Check that we are not stuck in an infinite loop.  */
  #define CHECK_INFINITE_LOOP(pat_cur, string_place)                     \
  do {                                                                   \
@@ -1553,16 +1555,15 @@ do {                                                                    \
               && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
        if (FAILURE_PAT (failure) == pat_cur)                            \
         {                                                               \
-         while (fail_stack.frame < fail_stack.avail)                   \
-           DISCARD_FAILURE_REG_OR_COUNT ();                            \
-         goto fail;                                                    \
+         cycle = 1;                                                    \
+         break;                                                        \
         }                                                               \
        DEBUG_PRINT2 ("  Other pattern: %p\n", FAILURE_PAT (failure));   \
        failure = NEXT_FAILURE_HANDLE(failure);                          \
      }                                                                  \
    DEBUG_PRINT2 ("  Other string: %p\n", FAILURE_STR (failure));                \
  } while (0)
-    
+
  /* Push the information about the state we will need
     if we ever fail back to it.
  
@@ -1761,8 +1762,11 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
  
  
  /* This is not an arbitrary limit: the arguments which represent offsets
-   into the pattern are two bytes long.  So if 2^16 bytes turns out to
+   into the pattern are two bytes long.  So if 2^15 bytes turns out to
     be too small, many things would have to change.  */
+# define MAX_BUF_SIZE (1L << 15)
+
+#if 0  /* This is when we thought it could be 2^16 bytes.  */
  /* Any other compiler which, like MSC, has allocation limit below 2^16
     bytes will have to use approach similar to what was done below for
     MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
@@ -1774,6 +1778,7 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
  #else
  # define MAX_BUF_SIZE (1L << 16)
  #endif
+#endif /* 0 */
  
  /* Extend the buffer by twice its current size via realloc and
     reset the pointers that pointed into the old block to point to the
@@ -1946,64 +1951,32 @@ struct range_table_work_area
  
  /* Get the next unsigned number in the uncompiled pattern.  */
  #define GET_UNSIGNED_NUMBER(num)                                       \
- do { if (p != pend)                                                   \
-     {                                                                 \
-       PATFETCH (c);                                                   \
-       if (c == ' ')                                                   \
-        FREE_STACK_RETURN (REG_BADBR);                                 \
-       while ('0' <= c && c <= '9')                                    \
-        {                                                              \
-           int prev;                                                   \
-          if (num < 0)                                                 \
-            num = 0;                                                   \
-          prev = num;                                                  \
-          num = num * 10 + c - '0';                                    \
-          if (num / 10 != prev)                                        \
-            FREE_STACK_RETURN (REG_BADBR);                             \
-          if (p == pend)                                               \
-            break;                                                     \
-          PATFETCH (c);                                                \
-        }                                                              \
-       if (c == ' ')                                                   \
-        FREE_STACK_RETURN (REG_BADBR);                                 \
-       }                                                               \
-    } while (0)
+  do {                                                                 \
+    if (p == pend)                                                     \
+      FREE_STACK_RETURN (REG_EBRACE);                                  \
+    else                                                               \
+      {                                                                        \
+       PATFETCH (c);                                                   \
+       while ('0' <= c && c <= '9')                                    \
+         {                                                             \
+           int prev;                                                   \
+           if (num < 0)                                                \
+             num = 0;                                                  \
+           prev = num;                                                 \
+           num = num * 10 + c - '0';                                   \
+           if (num / 10 != prev)                                       \
+             FREE_STACK_RETURN (REG_BADBR);                            \
+           if (p == pend)                                              \
+             FREE_STACK_RETURN (REG_EBRACE);                           \
+           PATFETCH (c);                                               \
+         }                                                             \
+      }                                                                        \
+  } while (0)
  \f
-#if WIDE_CHAR_SUPPORT
-/* The GNU C library provides support for user-defined character classes
-   and the functions from ISO C amendement 1.  */
-# ifdef CHARCLASS_NAME_MAX
-#  define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
-# else
-/* This shouldn't happen but some implementation might still have this
-   problem.  Use a reasonable default value.  */
-#  define CHAR_CLASS_MAX_LENGTH 256
-# endif
-typedef wctype_t re_wctype_t;
-typedef wchar_t re_wchar_t;
-# define re_wctype wctype
-# define re_iswctype iswctype
-# define re_wctype_to_bit(cc) 0
-#else
-# define CHAR_CLASS_MAX_LENGTH  9 /* Namely, `multibyte'.  */
-# define btowc(c) c
-
-/* Character classes.  */
-typedef enum { RECC_ERROR = 0,
-              RECC_ALNUM, RECC_ALPHA, RECC_WORD,
-              RECC_GRAPH, RECC_PRINT,
-              RECC_LOWER, RECC_UPPER,
-              RECC_PUNCT, RECC_CNTRL,
-              RECC_DIGIT, RECC_XDIGIT,
-              RECC_BLANK, RECC_SPACE,
-              RECC_MULTIBYTE, RECC_NONASCII,
-              RECC_ASCII, RECC_UNIBYTE
-} re_wctype_t;
-
-typedef int re_wchar_t;
+#if ! WIDE_CHAR_SUPPORT
  
  /* Map a string to the char class it names (if any).  */
-static re_wctype_t
+re_wctype_t
  re_wctype (str)
       re_char *str;
  {
@@ -2029,7 +2002,7 @@ re_wctype (str)
  }
  
  /* True iff CH is in the char class CC.  */
-static boolean
+boolean
  re_iswctype (ch, cc)
       int ch;
       re_wctype_t cc;
@@ -2089,7 +2062,7 @@ re_wctype_to_bit (cc)
  static void
  extend_range_table_work_area (work_area)
       struct range_table_work_area *work_area;
-{                                                                      
+{
    work_area->allocated += 16 * sizeof (int);
    if (work_area->table)
      work_area->table
@@ -2128,7 +2101,7 @@ set_image_of_range_1 (work_area, start, end, translate)
  
       `strange' indicates a character that has more than one
       case-equivalent.  */
-     
+
    enum case_type {one_case, two_case, strange};
  
    /* Describe the run that is in progress,
@@ -2198,7 +2171,7 @@ set_image_of_range_1 (work_area, start, end, translate)
             }
           run_type = strange;
         }
-             
+
        if (this_type == strange)
         {
           /* For a strange character, add each of its equivalents, one
@@ -2474,6 +2447,15 @@ regex_compile (pattern, size, syntax, bufp)
    /* If the object matched can contain multibyte characters.  */
    const boolean multibyte = RE_MULTIBYTE_P (bufp);
  
+  /* Nonzero if we have pushed down into a subpattern.  */
+  int in_subpattern = 0;
+
+  /* These hold the values of p, pattern, and pend from the main
+     pattern when we have pushed into a subpattern.  */
+  re_char *main_p;
+  re_char *main_pattern;
+  re_char *main_pend;
+
  #ifdef DEBUG
    debug++;
    DEBUG_PRINT1 ("\nCompiling pattern: ");
@@ -2536,12 +2518,61 @@ regex_compile (pattern, size, syntax, bufp)
    begalt = b = bufp->buffer;
  
    /* Loop through the uncompiled pattern until we're at the end.  */
-  while (p != pend)
+  while (1)
      {
+      if (p == pend)
+       {
+         /* If this is the end of an included regexp,
+            pop back to the main regexp and try again.  */
+         if (in_subpattern)
+           {
+             in_subpattern = 0;
+             pattern = main_pattern;
+             p = main_p;
+             pend = main_pend;
+             continue;
+           }
+         /* If this is the end of the main regexp, we are done.  */
+         break;
+       }
+
        PATFETCH (c);
  
        switch (c)
         {
+       case ' ':
+         {
+           re_char *p1 = p;
+
+           /* If there's no special whitespace regexp, treat
+              spaces normally.  And don't try to do this recursively.  */
+           if (!whitespace_regexp || in_subpattern)
+             goto normal_char;
+
+           /* Peek past following spaces.  */
+           while (p1 != pend)
+             {
+               if (*p1 != ' ')
+                 break;
+               p1++;
+             }
+           /* If the spaces are followed by a repetition op,
+              treat them normally.  */
+           if (p1 != pend
+               && (*p1 == '*' || *p1 == '+' || *p1 == '?'
+                   || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
+             goto normal_char;
+
+           /* Replace the spaces with the whitespace regexp.  */
+           in_subpattern = 1;
+           main_p = p1;
+           main_pend = pend;
+           main_pattern = pattern;
+           p = pattern = whitespace_regexp;
+           pend = p + strlen (p);
+           break;
+         }    
+
         case '^':
           {
             if (   /* If at start of pattern, it's an operator.  */
@@ -2645,10 +2676,10 @@ regex_compile (pattern, size, syntax, bufp)
                     unsigned int startoffset = 0;
                     re_opcode_t ofj =
                       /* Check if the loop can match the empty string.  */
-                     (simple || !analyse_first (laststart, b, NULL, 0)) ?
-                     on_failure_jump : on_failure_jump_loop;
+                     (simple || !analyse_first (laststart, b, NULL, 0))
+                     ? on_failure_jump : on_failure_jump_loop;
                     assert (skip_one_char (laststart) <= b);
-                   
+
                     if (!zero_times_ok && simple)
                       { /* Since simple * loops can be made faster by using
                            on_failure_keep_string_jump, we turn simple P+
@@ -2694,8 +2725,9 @@ regex_compile (pattern, size, syntax, bufp)
                   {
                     boolean emptyp = analyse_first (laststart, b, NULL, 0);
  
-                   /* The non-greedy multiple match looks like a repeat..until:
-                      we only need a conditional jump at the end of the loop */
+                   /* The non-greedy multiple match looks like
+                      a repeat..until: we only need a conditional jump
+                      at the end of the loop.  */
                     if (emptyp) BUF_PUSH (no_op);
                     STORE_JUMP (emptyp ? on_failure_jump_nastyloop
                                 : on_failure_jump, b, laststart);
@@ -2704,7 +2736,7 @@ regex_compile (pattern, size, syntax, bufp)
                       {
                         /* The repeat...until naturally matches one or more.
                            To also match zero times, we need to first jump to
-                          the end of the loop (its conditional jump). */
+                          the end of the loop (its conditional jump).  */
                         INSERT_JUMP (jump, laststart, b);
                         b += 3;
                       }
@@ -2909,7 +2941,7 @@ regex_compile (pattern, size, syntax, bufp)
                           }
                       }
                     else if (!SAME_CHARSET_P (c, c1))
-                     FREE_STACK_RETURN (REG_ERANGE);
+                     FREE_STACK_RETURN (REG_ERANGEX);
                   }
                 else
                   /* Range from C to C. */
@@ -3202,9 +3234,6 @@ regex_compile (pattern, size, syntax, bufp)
  
                 beg_interval = p;
  
-               if (p == pend)
-                 FREE_STACK_RETURN (REG_EBRACE);
-
                 GET_UNSIGNED_NUMBER (lower_bound);
  
                 if (c == ',')
@@ -3221,7 +3250,8 @@ regex_compile (pattern, size, syntax, bufp)
                   {
                     if (c != '\\')
                       FREE_STACK_RETURN (REG_BADBR);
-
+                   if (p == pend)
+                     FREE_STACK_RETURN (REG_EESCAPE);
                     PATFETCH (c);
                   }
  
@@ -3241,99 +3271,99 @@ regex_compile (pattern, size, syntax, bufp)
                       goto unfetch_interval;
                   }
  
-                if (upper_bound == 0)
-                  /* If the upper bound is zero, just drop the sub pattern
-                     altogether.  */
-                  b = laststart;
-                else if (lower_bound == 1 && upper_bound == 1)
-                  /* Just match it once: nothing to do here.  */
-                  ;
-
-                /* Otherwise, we have a nontrivial interval.  When
-                   we're all done, the pattern will look like:
-                     set_number_at <jump count> <upper bound>
-                     set_number_at <succeed_n count> <lower bound>
-                     succeed_n <after jump addr> <succeed_n count>
-                     <body of loop>
-                     jump_n <succeed_n addr> <jump count>
-                   (The upper bound and `jump_n' are omitted if
-                   `upper_bound' is 1, though.)  */
-                else
-                  { /* If the upper bound is > 1, we need to insert
-                       more at the end of the loop.  */
-                    unsigned int nbytes = (upper_bound < 0 ? 3
-                                           : upper_bound > 1 ? 5 : 0);
-                    unsigned int startoffset = 0;
-
-                    GET_BUFFER_SPACE (20); /* We might use less.  */
-
-                    if (lower_bound == 0)
-                      {
-                        /* A succeed_n that starts with 0 is really a
-                           a simple on_failure_jump_loop.  */
-                        INSERT_JUMP (on_failure_jump_loop, laststart,
-                                     b + 3 + nbytes);
-                        b += 3;
-                      }
-                    else
-                      {
-                        /* Initialize lower bound of the `succeed_n', even
-                           though it will be set during matching by its
-                           attendant `set_number_at' (inserted next),
-                           because `re_compile_fastmap' needs to know.
-                           Jump to the `jump_n' we might insert below.  */
-                        INSERT_JUMP2 (succeed_n, laststart,
-                                      b + 5 + nbytes,
-                                      lower_bound);
-                        b += 5;
-
-                        /* Code to initialize the lower bound.  Insert
-                           before the `succeed_n'.      The `5' is the last two
-                           bytes of this `set_number_at', plus 3 bytes of
-                           the following `succeed_n'.  */
-                        insert_op2 (set_number_at, laststart, 5, lower_bound, b);
-                        b += 5;
-                        startoffset += 5;
-                      }
-
-                    if (upper_bound < 0)
-                      {
-                        /* A negative upper bound stands for infinity,
-                           in which case it degenerates to a plain jump.  */
-                        STORE_JUMP (jump, b, laststart + startoffset);
-                        b += 3;
-                      }
-                    else if (upper_bound > 1)
-                      { /* More than one repetition is allowed, so
-                           append a backward jump to the `succeed_n'
-                           that starts this interval.
-
-                           When we've reached this during matching,
-                           we'll have matched the interval once, so
-                           jump back only `upper_bound - 1' times.  */
-                        STORE_JUMP2 (jump_n, b, laststart + startoffset,
-                                     upper_bound - 1);
-                        b += 5;
-
-                        /* The location we want to set is the second
-                           parameter of the `jump_n'; that is `b-2' as
-                           an absolute address.  `laststart' will be
-                           the `set_number_at' we're about to insert;
-                           `laststart+3' the number to set, the source
-                           for the relative address.  But we are
-                           inserting into the middle of the pattern --
-                           so everything is getting moved up by 5.
-                           Conclusion: (b - 2) - (laststart + 3) + 5,
-                           i.e., b - laststart.
-
-                           We insert this at the beginning of the loop
-                           so that if we fail during matching, we'll
-                           reinitialize the bounds.  */
-                        insert_op2 (set_number_at, laststart, b - laststart,
-                                    upper_bound - 1, b);
-                        b += 5;
-                      }
-                  }
+               if (upper_bound == 0)
+                 /* If the upper bound is zero, just drop the sub pattern
+                    altogether.  */
+                 b = laststart;
+               else if (lower_bound == 1 && upper_bound == 1)
+                 /* Just match it once: nothing to do here.  */
+                 ;
+
+               /* Otherwise, we have a nontrivial interval.  When
+                  we're all done, the pattern will look like:
+                  set_number_at <jump count> <upper bound>
+                  set_number_at <succeed_n count> <lower bound>
+                  succeed_n <after jump addr> <succeed_n count>
+                  <body of loop>
+                  jump_n <succeed_n addr> <jump count>
+                  (The upper bound and `jump_n' are omitted if
+                  `upper_bound' is 1, though.)  */
+               else
+                 { /* If the upper bound is > 1, we need to insert
+                      more at the end of the loop.  */
+                   unsigned int nbytes = (upper_bound < 0 ? 3
+                                          : upper_bound > 1 ? 5 : 0);
+                   unsigned int startoffset = 0;
+
+                   GET_BUFFER_SPACE (20); /* We might use less.  */
+
+                   if (lower_bound == 0)
+                     {
+                       /* A succeed_n that starts with 0 is really a
+                          simple on_failure_jump_loop.  */
+                       INSERT_JUMP (on_failure_jump_loop, laststart,
+                                    b + 3 + nbytes);
+                       b += 3;
+                     }
+                   else
+                     {
+                       /* Initialize lower bound of the `succeed_n', even
+                          though it will be set during matching by its
+                          attendant `set_number_at' (inserted next),
+                          because `re_compile_fastmap' needs to know.
+                          Jump to the `jump_n' we might insert below.  */
+                       INSERT_JUMP2 (succeed_n, laststart,
+                                     b + 5 + nbytes,
+                                     lower_bound);
+                       b += 5;
+
+                       /* Code to initialize the lower bound.  Insert
+                          before the `succeed_n'.       The `5' is the last two
+                          bytes of this `set_number_at', plus 3 bytes of
+                          the following `succeed_n'.  */
+                       insert_op2 (set_number_at, laststart, 5, lower_bound, b);
+                       b += 5;
+                       startoffset += 5;
+                     }
+
+                   if (upper_bound < 0)
+                     {
+                       /* A negative upper bound stands for infinity,
+                          in which case it degenerates to a plain jump.  */
+                       STORE_JUMP (jump, b, laststart + startoffset);
+                       b += 3;
+                     }
+                   else if (upper_bound > 1)
+                     { /* More than one repetition is allowed, so
+                          append a backward jump to the `succeed_n'
+                          that starts this interval.
+
+                          When we've reached this during matching,
+                          we'll have matched the interval once, so
+                          jump back only `upper_bound - 1' times.  */
+                       STORE_JUMP2 (jump_n, b, laststart + startoffset,
+                                    upper_bound - 1);
+                       b += 5;
+
+                       /* The location we want to set is the second
+                          parameter of the `jump_n'; that is `b-2' as
+                          an absolute address.  `laststart' will be
+                          the `set_number_at' we're about to insert;
+                          `laststart+3' the number to set, the source
+                          for the relative address.  But we are
+                          inserting into the middle of the pattern --
+                          so everything is getting moved up by 5.
+                          Conclusion: (b - 2) - (laststart + 3) + 5,
+                          i.e., b - laststart.
+
+                          We insert this at the beginning of the loop
+                          so that if we fail during matching, we'll
+                          reinitialize the bounds.  */
+                       insert_op2 (set_number_at, laststart, b - laststart,
+                                   upper_bound - 1, b);
+                       b += 5;
+                     }
+                 }
                 pending_exact = 0;
                 beg_interval = NULL;
               }
@@ -3417,6 +3447,19 @@ regex_compile (pattern, size, syntax, bufp)
               BUF_PUSH (wordend);
               break;
  
+           case '_':
+             if (syntax & RE_NO_GNU_OPS)
+               goto normal_char;
+              laststart = b;
+              PATFETCH (c);
+              if (c == '<')
+                BUF_PUSH (symbeg);
+              else if (c == '>')
+                BUF_PUSH (symend);
+              else
+                FREE_STACK_RETURN (REG_BADPAT);
+              break;
+
             case 'b':
               if (syntax & RE_NO_GNU_OPS)
                 goto normal_char;
@@ -3538,8 +3581,6 @@ regex_compile (pattern, size, syntax, bufp)
    if (syntax & RE_NO_POSIX_BACKTRACKING)
      BUF_PUSH (succeed);
  
-  free (compile_stack.stack);
-
    /* We have succeeded; set the length of the buffer.  */
    bufp->used = b - bufp->buffer;
  
@@ -3579,7 +3620,7 @@ regex_compile (pattern, size, syntax, bufp)
    }
  #endif /* not MATCH_MAY_ALLOCATE */
  
-  return REG_NOERROR;
+  FREE_STACK_RETURN (REG_NOERROR);
  } /* regex_compile */
  \f
  /* Subroutines for `regex_compile'.  */
@@ -3911,6 +3952,8 @@ analyse_first (p, pend, fastmap, multibyte)
         case notwordbound:
         case wordbeg:
         case wordend:
+       case symbeg:
+       case symend:
           continue;
  
  
@@ -3964,7 +4007,7 @@ analyse_first (p, pend, fastmap, multibyte)
              case has already been handled, so we only need to look at the
              fallthrough case.  */
           continue;
-         
+
         case succeed_n:
           /* If N == 0, it should be an on_failure_jump_loop instead.  */
           DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
@@ -4089,6 +4132,10 @@ re_search (bufp, string, size, startpos, range, regs)
  }
  WEAK_ALIAS (__re_search, re_search)
  
+/* Start address of the string (string1 or string2) containing offset P.  */
+#define HEAD_ADDR_VSTRING(P)           \
+  (((P) >= size1 ? string2 : string1))
+
  /* End address of virtual concatenation of string.  */
  #define STOP_ADDR_VSTRING(P)                           \
    (((P) >= size1 ? string2 + size2 : string1 + size1))
@@ -4244,12 +4291,19 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
                         d += buf_charlen;
                       }
                   else
-                   while (range > lim
-                          && !fastmap[RE_TRANSLATE (translate, *d)])
-                     {
-                       d++;
-                       range--;
-                     }
+                   {
+                     /* Convert *d to integer to shut up GCC's
+                        whining about comparison that is always
+                        true.  */
+                     int di = *d;
+
+                     while (range > lim
+                            && !fastmap[RE_TRANSLATE (translate, di)])
+                       {
+                         di = *(++d);
+                         range--;
+                       }
+                   }
                 }
               else
                 while (range > lim && !fastmap[*d])
@@ -4324,26 +4378,17 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
           /* Update STARTPOS to the previous character boundary.  */
           if (multibyte)
             {
-             re_char *p = POS_ADDR_VSTRING (startpos);
-             int len = 0;
+             re_char *p = POS_ADDR_VSTRING (startpos) + 1;
+             re_char *p0 = p;
+             re_char *phead = HEAD_ADDR_VSTRING (startpos);
  
               /* Find the head of multibyte form.  */
-             while (!CHAR_HEAD_P (*p))
-               p--, len++;
-
-             /* Adjust it. */
-#if 0                          /* XXX */
-             if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
-               ;
-             else
-#endif
-               {
-                 range += len;
-                 if (range > 0)
-                   break;
+             PREV_CHAR_BOUNDARY (p, phead);
+             range += p0 - 1 - p;
+             if (range > 0)
+               break;
  
-                 startpos -= len;
-               }
+             startpos -= p0 - 1 - p;
             }
         }
      }
@@ -4452,7 +4497,7 @@ skip_one_char (p)
      {
      case anychar:
        break;
-      
+
      case exactn:
        p += *p + 1;
        break;
@@ -4469,7 +4514,7 @@ skip_one_char (p)
        else
         p += 1 + CHARSET_BITMAP_SIZE (p - 1);
        break;
-      
+
      case syntaxspec:
      case notsyntaxspec:
  #ifdef emacs
@@ -4487,9 +4532,9 @@ skip_one_char (p)
  
  
  /* Jump over non-matching operations.  */
-static unsigned char *
+static re_char *
  skip_noops (p, pend)
-     unsigned char *p, *pend;
+     re_char *p, *pend;
  {
    int mcnt;
    while (p < pend)
@@ -4518,7 +4563,7 @@ skip_noops (p, pend)
  static int
  mutually_exclusive_p (bufp, p1, p2)
       struct re_pattern_buffer *bufp;
-     unsigned char *p1, *p2;
+     re_char *p1, *p2;
  {
    re_opcode_t op2;
    const boolean multibyte = RE_MULTIBYTE_P (bufp);
@@ -4552,7 +4597,7 @@ mutually_exclusive_p (bufp, p1, p2)
           return 1;
         }
        break;
-      
+
      case endline:
      case exactn:
        {
@@ -4662,7 +4707,7 @@ mutually_exclusive_p (bufp, p1, p2)
           }
        }
        break;
-      
+
      case charset_not:
        switch (SWITCH_ENUM_CAST (*p1))
         {
@@ -4680,14 +4725,20 @@ mutually_exclusive_p (bufp, p1, p2)
        break;
  
      case wordend:
-    case notsyntaxspec:
+      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
+    case symend:
        return ((re_opcode_t) *p1 == syntaxspec
-             && p1[1] == (op2 == wordend ? Sword : p2[1]));
+              && (p1[1] == Ssymbol || p1[1] == Sword));
+    case notsyntaxspec:
+      return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
  
      case wordbeg:
-    case syntaxspec:
+      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
+    case symbeg:
        return ((re_opcode_t) *p1 == notsyntaxspec
-             && p1[1] == (op2 == wordend ? Sword : p2[1]));
+              && (p1[1] == Ssymbol || p1[1] == Sword));
+    case syntaxspec:
+      return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
  
      case wordbound:
        return (((re_opcode_t) *p1 == notsyntaxspec
@@ -5201,8 +5252,13 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
               else
                 do
                   {
+                   /* Avoid compiler whining about comparison being
+                      always true.  */
+                   int di;
+
                     PREFETCH ();
-                   if (RE_TRANSLATE (translate, *d) != *p++)
+                   di = *d;
+                   if (RE_TRANSLATE (translate, di) != *p++)
                       {
                         d = dfail;
                         goto fail;
@@ -5346,7 +5402,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
  
           assert (!REG_UNSET (regstart[*p]));
           /* Strictly speaking, there should be code such as:
-            
+
                 assert (REG_UNSET (regend[*p]));
                 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
  
@@ -5518,7 +5574,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
              cycle detection cannot work.  Worse yet, such a detection
              can not only fail to detect a cycle, but it can also wrongly
              detect a cycle (between different instantiations of the same
-            loop.
+            loop).
              So the method used for those nasty loops is a little different:
              We use a special cycle-detection-stack-frame which is pushed
              when the on_failure_jump_nastyloop failure-point is *popped*.
@@ -5532,11 +5588,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
                         mcnt, p + mcnt);
  
           assert ((re_opcode_t)p[-4] == no_op);
-         CHECK_INFINITE_LOOP (p - 4, d);
-         PUSH_FAILURE_POINT (p - 3, d);
+         {
+           int cycle = 0;
+           CHECK_INFINITE_LOOP (p - 4, d);
+           if (!cycle)
+             /* If there's a cycle, just continue without pushing
+                this failure point.  The failure point is the "try again"
+                option, which shouldn't be tried.
+                We want (x?)*?y\1z to match both xxyz and xxyxz.  */
+             PUSH_FAILURE_POINT (p - 3, d);
+         }
           break;
  
-
           /* Simple loop detecting on_failure_jump:  just check on the
              failure stack if the same spot was already hit earlier.  */
         case on_failure_jump_loop:
@@ -5544,9 +5607,19 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
           EXTRACT_NUMBER_AND_INCR (mcnt, p);
           DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
                         mcnt, p + mcnt);
-
-         CHECK_INFINITE_LOOP (p - 3, d);
-         PUSH_FAILURE_POINT (p - 3, d);
+         {
+           int cycle = 0;
+           CHECK_INFINITE_LOOP (p - 3, d);
+           if (cycle)
+             /* If there's a cycle, get out of the loop, as if the matching
+                had failed.  We used to just `goto fail' here, but that was
+                aborting the search a bit too early: we want to keep the
+                empty-loop-match and keep matching after the loop.
+                We want (x?)*y\1z to match both xxyz and xxyxz.  */
+             p += mcnt;
+           else
+             PUSH_FAILURE_POINT (p - 3, d);
+         }
           break;
  
  
@@ -5746,7 +5819,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
               PREFETCH ();
               c2 = RE_STRING_CHAR (d, dend - d);
               s2 = SYNTAX (c2);
-       
+
               /* Case 2: S2 is not Sword. */
               if (s2 != Sword)
                 goto fail;
@@ -5800,7 +5873,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
                   PREFETCH_NOLIMIT ();
                   c2 = RE_STRING_CHAR (d, dend - d);
  #ifdef emacs
-                 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
+                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
  #endif
                   s2 = SYNTAX (c2);
  
@@ -5812,6 +5885,92 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
             }
           break;
  
+       case symbeg:
+         DEBUG_PRINT1 ("EXECUTING symbeg.\n");
+
+         /* We FAIL in one of the following cases: */
+
+         /* Case 1: D is at the end of string.  */
+         if (AT_STRINGS_END (d))
+           goto fail;
+         else
+           {
+             /* C1 is the character before D, S1 is the syntax of C1, C2
+                is the character at D, and S2 is the syntax of C2.  */
+             re_wchar_t c1, c2;
+             int s1, s2;
+#ifdef emacs
+             int offset = PTR_TO_OFFSET (d);
+             int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+             UPDATE_SYNTAX_TABLE (charpos);
+#endif
+             PREFETCH ();
+             c2 = RE_STRING_CHAR (d, dend - d);
+             s2 = SYNTAX (c2);
+       
+             /* Case 2: S2 is neither Sword nor Ssymbol. */
+             if (s2 != Sword && s2 != Ssymbol)
+               goto fail;
+
+             /* Case 3: D is not at the beginning of string ... */
+             if (!AT_STRINGS_BEG (d))
+               {
+                 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+#ifdef emacs
+                 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
+#endif
+                 s1 = SYNTAX (c1);
+
+                 /* ... and S1 is Sword or Ssymbol.  */
+                 if (s1 == Sword || s1 == Ssymbol)
+                   goto fail;
+               }
+           }
+         break;
+
+       case symend:
+         DEBUG_PRINT1 ("EXECUTING symend.\n");
+
+         /* We FAIL in one of the following cases: */
+
+         /* Case 1: D is at the beginning of string.  */
+         if (AT_STRINGS_BEG (d))
+           goto fail;
+         else
+           {
+             /* C1 is the character before D, S1 is the syntax of C1, C2
+                is the character at D, and S2 is the syntax of C2.  */
+             re_wchar_t c1, c2;
+             int s1, s2;
+#ifdef emacs
+             int offset = PTR_TO_OFFSET (d) - 1;
+             int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
+             UPDATE_SYNTAX_TABLE (charpos);
+#endif
+             GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
+             s1 = SYNTAX (c1);
+
+             /* Case 2: S1 is neither Ssymbol nor Sword.  */
+             if (s1 != Sword && s1 != Ssymbol)
+               goto fail;
+
+             /* Case 3: D is not at the end of string ... */
+             if (!AT_STRINGS_END (d))
+               {
+                 PREFETCH_NOLIMIT ();
+                 c2 = RE_STRING_CHAR (d, dend - d);
+#ifdef emacs
+                 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+#endif
+                 s2 = SYNTAX (c2);
+
+                 /* ... and S2 is Sword or Ssymbol.  */
+                 if (s2 == Sword || s2 == Ssymbol)
+                    goto fail;
+               }
+           }
+         break;
+
         case syntaxspec:
         case notsyntaxspec:
           not = (re_opcode_t) *(p - 1) == notsyntaxspec;
@@ -6207,7 +6366,7 @@ regexec (preg, string, nmatch, pmatch, eflags)
      const regex_t *__restrict preg;
      const char *__restrict string;
      size_t nmatch;
-    regmatch_t pmatch[];
+    regmatch_t pmatch[__restrict_arr];
      int eflags;
  {
    int ret;
@@ -6339,3 +6498,6 @@ regfree (preg)
  WEAK_ALIAS (__regfree, regfree)
  
  #endif /* not emacs  */
+
+/* arch-tag: 4ffd68ba-2a9e-435b-a21a-018990f9eeb2
+   (do not change this comment) */