]> code.delx.au - gnu-emacs/blobdiff - src/regex.c
Typo
[gnu-emacs] / src / regex.c
index 37436fffba0fe3e5feb0670114b101149ea619c7..43351b380de63a65d4f55bb7ca72524425d05093 100644 (file)
@@ -19,7 +19,9 @@
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
    USA.         */
 
-/* TODO:
+/* BUGS:
+   - (x?)*y\1z should match both xxxxyxz and xxxyz.
+   TODO:
    - structure the opcode space into opcode+flag.
    - merge with glibc's regex.[ch].
    - replace (succeed_n + jump_n + set_number_at) with something that doesn't
@@ -1518,19 +1520,43 @@ do {                                                                    \
     }                                                                  \
 } while (0)
 
+/* Discard a saved register off the stack.  */
+#define DISCARD_FAILURE_REG_OR_COUNT()                                 \
+do {                                                                   \
+  int reg = POP_FAILURE_INT ();                                                \
+  if (reg == -1)                                                       \
+    {                                                                  \
+      /* It's a counter.  */                                           \
+      POP_FAILURE_POINTER ();                                          \
+      reg = POP_FAILURE_INT ();                                                \
+      DEBUG_PRINT3 ("     Discard counter %p = %d\n", ptr, reg);       \
+    }                                                                  \
+  else                                                                 \
+    {                                                                  \
+      POP_FAILURE_POINTER ();                                          \
+      POP_FAILURE_POINTER ();                                          \
+      DEBUG_PRINT4 ("     Discard reg %d (spanning %p -> %p)\n",       \
+                   reg, regstart[reg], regend[reg]);                   \
+    }                                                                  \
+} while (0)
+
 /* Check that we are not stuck in an infinite loop.  */
 #define CHECK_INFINITE_LOOP(pat_cur, string_place)                     \
 do {                                                                   \
-  int failure = TOP_FAILURE_HANDLE();                                  \
+  int failure = TOP_FAILURE_HANDLE ();                                 \
   /* Check for infinite matching loops */                              \
-  while (failure > 0 &&                                                        \
-        (FAILURE_STR (failure) == string_place                         \
-         || FAILURE_STR (failure) == NULL))                            \
+  while (failure > 0                                                   \
+        && (FAILURE_STR (failure) == string_place                      \
+            || FAILURE_STR (failure) == NULL))                         \
     {                                                                  \
       assert (FAILURE_PAT (failure) >= bufp->buffer                    \
              && FAILURE_PAT (failure) <= bufp->buffer + bufp->used);   \
       if (FAILURE_PAT (failure) == pat_cur)                            \
-       goto fail;                                                      \
+       {                                                               \
+         while (fail_stack.frame < fail_stack.avail)                   \
+           DISCARD_FAILURE_REG_OR_COUNT ();                            \
+         goto fail;                                                    \
+       }                                                               \
       DEBUG_PRINT2 ("  Other pattern: %p\n", FAILURE_PAT (failure));   \
       failure = NEXT_FAILURE_HANDLE(failure);                          \
     }                                                                  \
@@ -1658,17 +1684,9 @@ static re_char *skip_one_char _RE_ARGS ((re_char *p));
 static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
                                    char *fastmap, const int multibyte));
 
-/* Fetch the next character in the uncompiled pattern---translating it
-   if necessary.  */
-#define PATFETCH(c)                                                    \
-  do {                                                                 \
-    PATFETCH_RAW (c);                                                  \
-    c = TRANSLATE (c);                                                 \
-  } while (0)
-
 /* Fetch the next character in the uncompiled pattern, with no
    translation.  */
-#define PATFETCH_RAW(c)                                                        \
+#define PATFETCH(c)                                                    \
   do {                                                                 \
     int len;                                                           \
     if (p == pend) return REG_EEND;                                    \
@@ -1816,7 +1834,7 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
 
 /* But patterns can have more than `MAX_REGNUM' registers.  We just
    ignore the excess.  */
-typedef unsigned regnum_t;
+typedef int regnum_t;
 
 
 /* Macros for the compile stack.  */
@@ -1851,7 +1869,17 @@ typedef struct
 /* The next available element.  */
 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
 
-
+/* Explicit quit checking is only used on NTemacs.  */
+#if defined WINDOWSNT && defined emacs && defined QUIT
+extern int immediate_quit;
+# define IMMEDIATE_QUIT_CHECK                  \
+    do {                                       \
+      if (immediate_quit) QUIT;                        \
+    } while (0)
+#else
+# define IMMEDIATE_QUIT_CHECK    ((void)0)
+#endif
+\f
 /* Structure to manage work area for range table.  */
 struct range_table_work_area
 {
@@ -1861,21 +1889,19 @@ struct range_table_work_area
   int bits;                    /* flag to record character classes */
 };
 
-/* Make sure that WORK_AREA can hold more N multibyte characters.  */
-#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n)                       \
-  do {                                                                   \
-    if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated)  \
-      {                                                                          \
-       (work_area).allocated += 16 * sizeof (int);                       \
-       if ((work_area).table)                                            \
-         (work_area).table                                               \
-           = (int *) realloc ((work_area).table, (work_area).allocated); \
-       else                                                              \
-         (work_area).table                                               \
-           = (int *) malloc ((work_area).allocated);                     \
-       if ((work_area).table == 0)                                       \
-         FREE_STACK_RETURN (REG_ESPACE);                                 \
-      }                                                                          \
+/* Make sure that WORK_AREA can hold more N multibyte characters.
+   This is used only in set_image_of_range and set_image_of_range_1.
+   It expects WORK_AREA to be a pointer.
+   If it can't get the space, it returns from the surrounding function.  */
+
+#define EXTEND_RANGE_TABLE(work_area, n)                               \
+  do {                                                                 \
+    if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
+      {                                                                        \
+        extend_range_table_work_area (work_area);                      \
+        if ((work_area)->table == 0)                                   \
+          return (REG_ESPACE);                                         \
+      }                                                                        \
   } while (0)
 
 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit)          \
@@ -1890,12 +1916,15 @@ struct range_table_work_area
 #define BIT_UPPER      0x10
 #define BIT_MULTIBYTE  0x20
 
-/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)   \
+/* Set a range START..END to WORK_AREA.
+   The range is passed through TRANSLATE, so START and END
+   should be untranslated.  */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end)               \
   do {                                                                 \
-    EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);                     \
-    (work_area).table[(work_area).used++] = (range_start);             \
-    (work_area).table[(work_area).used++] = (range_end);               \
+    int tem;                                                           \
+    tem = set_image_of_range (&work_area, start, end, translate);      \
+    if (tem > 0)                                                       \
+      FREE_STACK_RETURN (tem);                                         \
   } while (0)
 
 /* Free allocated memory for WORK_AREA.         */
@@ -1909,7 +1938,7 @@ struct range_table_work_area
 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
-
+\f
 
 /* Set the bit for character C in a list.  */
 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
@@ -1920,18 +1949,26 @@ struct range_table_work_area
  do { if (p != pend)                                                   \
      {                                                                 \
        PATFETCH (c);                                                   \
+       if (c == ' ')                                                   \
+        FREE_STACK_RETURN (REG_BADBR);                                 \
        while ('0' <= c && c <= '9')                                    \
         {                                                              \
+           int prev;                                                   \
           if (num < 0)                                                 \
-             num = 0;                                                  \
+            num = 0;                                                   \
+          prev = num;                                                  \
           num = num * 10 + c - '0';                                    \
+          if (num / 10 != prev)                                        \
+            FREE_STACK_RETURN (REG_BADBR);                             \
           if (p == pend)                                               \
-             break;                                                    \
+            break;                                                     \
           PATFETCH (c);                                                \
         }                                                              \
+       if (c == ' ')                                                   \
+        FREE_STACK_RETURN (REG_BADBR);                                 \
        }                                                               \
     } while (0)
-
+\f
 #if WIDE_CHAR_SUPPORT
 /* The GNU C library provides support for user-defined character classes
    and the functions from ISO C amendement 1.  */
@@ -2044,17 +2081,256 @@ re_wctype_to_bit (cc)
     }
 }
 #endif
+\f
+/* Filling in the work area of a range.  */
 
-/* Explicit quit checking is only used on NTemacs.  */
-#if defined WINDOWSNT && defined emacs && defined QUIT
-extern int immediate_quit;
-# define IMMEDIATE_QUIT_CHECK                  \
-    do {                                       \
-      if (immediate_quit) QUIT;                        \
-    } while (0)
-#else
-# define IMMEDIATE_QUIT_CHECK    ((void)0)
+/* Actually extend the space in WORK_AREA.  */
+
+static void
+extend_range_table_work_area (work_area)
+     struct range_table_work_area *work_area;
+{                                                                      
+  work_area->allocated += 16 * sizeof (int);
+  if (work_area->table)
+    work_area->table
+      = (int *) realloc (work_area->table, work_area->allocated);
+  else
+    work_area->table
+      = (int *) malloc (work_area->allocated);
+}
+
+#ifdef emacs
+
+/* Carefully find the ranges of codes that are equivalent
+   under case conversion to the range start..end when passed through
+   TRANSLATE.  Handle the case where non-letters can come in between
+   two upper-case letters (which happens in Latin-1).
+   Also handle the case of groups of more than 2 case-equivalent chars.
+
+   The basic method is to look at consecutive characters and see
+   if they can form a run that can be handled as one.
+
+   Returns -1 if successful, REG_ESPACE if ran out of space.  */
+
+static int
+set_image_of_range_1 (work_area, start, end, translate)
+     RE_TRANSLATE_TYPE translate;
+     struct range_table_work_area *work_area;
+     re_wchar_t start, end;
+{
+  /* `one_case' indicates a character, or a run of characters,
+     each of which is an isolate (no case-equivalents).
+     This includes all ASCII non-letters.
+
+     `two_case' indicates a character, or a run of characters,
+     each of which has two case-equivalent forms.
+     This includes all ASCII letters.
+
+     `strange' indicates a character that has more than one
+     case-equivalent.  */
+     
+  enum case_type {one_case, two_case, strange};
+
+  /* Describe the run that is in progress,
+     which the next character can try to extend.
+     If run_type is strange, that means there really is no run.
+     If run_type is one_case, then run_start...run_end is the run.
+     If run_type is two_case, then the run is run_start...run_end,
+     and the case-equivalents end at run_eqv_end.  */
+
+  enum case_type run_type = strange;
+  int run_start, run_end, run_eqv_end;
+
+  Lisp_Object eqv_table;
+
+  if (!RE_TRANSLATE_P (translate))
+    {
+      EXTEND_RANGE_TABLE (work_area, 2);
+      work_area->table[work_area->used++] = (start);
+      work_area->table[work_area->used++] = (end);
+      return -1;
+    }
+
+  eqv_table = XCHAR_TABLE (translate)->extras[2];
+
+  for (; start <= end; start++)
+    {
+      enum case_type this_type;
+      int eqv = RE_TRANSLATE (eqv_table, start);
+      int minchar, maxchar;
+
+      /* Classify this character */
+      if (eqv == start)
+       this_type = one_case;
+      else if (RE_TRANSLATE (eqv_table, eqv) == start)
+       this_type = two_case;
+      else
+       this_type = strange;
+
+      if (start < eqv)
+       minchar = start, maxchar = eqv;
+      else
+       minchar = eqv, maxchar = start;
+
+      /* Can this character extend the run in progress?  */
+      if (this_type == strange || this_type != run_type
+         || !(minchar == run_end + 1
+              && (run_type == two_case
+                  ? maxchar == run_eqv_end + 1 : 1)))
+       {
+         /* No, end the run.
+            Record each of its equivalent ranges.  */
+         if (run_type == one_case)
+           {
+             EXTEND_RANGE_TABLE (work_area, 2);
+             work_area->table[work_area->used++] = run_start;
+             work_area->table[work_area->used++] = run_end;
+           }
+         else if (run_type == two_case)
+           {
+             EXTEND_RANGE_TABLE (work_area, 4);
+             work_area->table[work_area->used++] = run_start;
+             work_area->table[work_area->used++] = run_end;
+             work_area->table[work_area->used++]
+               = RE_TRANSLATE (eqv_table, run_start);
+             work_area->table[work_area->used++]
+               = RE_TRANSLATE (eqv_table, run_end);
+           }
+         run_type = strange;
+       }
+             
+      if (this_type == strange)
+       {
+         /* For a strange character, add each of its equivalents, one
+            by one.  Don't start a range.  */
+         do
+           {
+             EXTEND_RANGE_TABLE (work_area, 2);
+             work_area->table[work_area->used++] = eqv;
+             work_area->table[work_area->used++] = eqv;
+             eqv = RE_TRANSLATE (eqv_table, eqv);
+           }
+         while (eqv != start);
+       }
+
+      /* Add this char to the run, or start a new run.  */
+      else if (run_type == strange)
+       {
+         /* Initialize a new range.  */
+         run_type = this_type;
+         run_start = start;
+         run_end = start;
+         run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
+       }
+      else
+       {
+         /* Extend a running range.  */
+         run_end = minchar;
+         run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
+       }
+    }
+
+  /* If a run is still in progress at the end, finish it now
+     by recording its equivalent ranges.  */
+  if (run_type == one_case)
+    {
+      EXTEND_RANGE_TABLE (work_area, 2);
+      work_area->table[work_area->used++] = run_start;
+      work_area->table[work_area->used++] = run_end;
+    }
+  else if (run_type == two_case)
+    {
+      EXTEND_RANGE_TABLE (work_area, 4);
+      work_area->table[work_area->used++] = run_start;
+      work_area->table[work_area->used++] = run_end;
+      work_area->table[work_area->used++]
+       = RE_TRANSLATE (eqv_table, run_start);
+      work_area->table[work_area->used++]
+       = RE_TRANSLATE (eqv_table, run_end);
+    }
+
+  return -1;
+}
+
+#endif /* emacs */
+
+/* Record the the image of the range start..end when passed through
+   TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
+   and is not even necessarily contiguous.
+   Normally we approximate it with the smallest contiguous range that contains
+   all the chars we need.  However, for Latin-1 we go to extra effort
+   to do a better job.
+
+   This function is not called for ASCII ranges.
+
+   Returns -1 if successful, REG_ESPACE if ran out of space.  */
+
+static int
+set_image_of_range (work_area, start, end, translate)
+     RE_TRANSLATE_TYPE translate;
+     struct range_table_work_area *work_area;
+     re_wchar_t start, end;
+{
+  re_wchar_t cmin, cmax;
+
+#ifdef emacs
+  /* For Latin-1 ranges, use set_image_of_range_1
+     to get proper handling of ranges that include letters and nonletters.
+     For a range that includes the whole of Latin-1, this is not necessary.
+     For other character sets, we don't bother to get this right.  */
+  if (RE_TRANSLATE_P (translate) && start < 04400
+      && !(start < 04200 && end >= 04377))
+    {
+      int newend;
+      int tem;
+      newend = end;
+      if (newend > 04377)
+       newend = 04377;
+      tem = set_image_of_range_1 (work_area, start, newend, translate);
+      if (tem > 0)
+       return tem;
+
+      start = 04400;
+      if (end < 04400)
+       return -1;
+    }
 #endif
+
+  EXTEND_RANGE_TABLE (work_area, 2);
+  work_area->table[work_area->used++] = (start);
+  work_area->table[work_area->used++] = (end);
+
+  cmin = -1, cmax = -1;
+
+  if (RE_TRANSLATE_P (translate))
+    {
+      int ch;
+
+      for (ch = start; ch <= end; ch++)
+       {
+         re_wchar_t c = TRANSLATE (ch);
+         if (! (start <= c && c <= end))
+           {
+             if (cmin == -1)
+               cmin = c, cmax = c;
+             else
+               {
+                 cmin = MIN (cmin, c);
+                 cmax = MAX (cmax, c);
+               }
+           }
+       }
+
+      if (cmin != -1)
+       {
+         EXTEND_RANGE_TABLE (work_area, 2);
+         work_area->table[work_area->used++] = (cmin);
+         work_area->table[work_area->used++] = (cmax);
+       }
+    }
+
+  return -1;
+}
 \f
 #ifndef MATCH_MAY_ALLOCATE
 
@@ -2493,6 +2769,10 @@ regex_compile (pattern, size, syntax, bufp)
 
                if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
+               /* Don't translate yet.  The range TRANSLATE(X..Y) cannot
+                  always be determined from TRANSLATE(X) and TRANSLATE(Y)
+                  So the translation is done later in a loop.  Example:
+                  (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
                PATFETCH (c);
 
                /* \ might escape characters inside [...] and [^...].  */
@@ -2552,7 +2832,7 @@ regex_compile (pattern, size, syntax, bufp)
                       them).  */
                    if (c == ':' && *p == ']')
                      {
-                       int ch;
+                       re_wchar_t ch;
                        re_wctype_t cc;
 
                        cc = re_wctype (str);
@@ -2621,8 +2901,8 @@ regex_compile (pattern, size, syntax, bufp)
                               starting at the smallest character in
                               the charset of C1 and ending at C1.  */
                            int charset = CHAR_CHARSET (c1);
-                           int c2 = MAKE_CHAR (charset, 0, 0);
-                           
+                           re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
+
                            SET_RANGE_TABLE_WORK_AREA (range_table_work,
                                                       c2, c1);
                            c1 = 0377;
@@ -2640,7 +2920,7 @@ regex_compile (pattern, size, syntax, bufp)
                  /* ... into bitmap.  */
                  {
                    re_wchar_t this_char;
-                   int range_start = c, range_end = c1;
+                   re_wchar_t range_start = c, range_end = c1;
 
                    /* If the start is after the end, the range is empty.  */
                    if (range_start > range_end)
@@ -2737,7 +3017,7 @@ regex_compile (pattern, size, syntax, bufp)
          /* Do not translate the character after the \, so that we can
             distinguish, e.g., \B from \b, even if we normally would
             translate, e.g., B to b.  */
-         PATFETCH_RAW (c);
+         PATFETCH (c);
 
          switch (c)
            {
@@ -3097,13 +3377,13 @@ regex_compile (pattern, size, syntax, bufp)
 
            case 'c':
              laststart = b;
-             PATFETCH_RAW (c);
+             PATFETCH (c);
              BUF_PUSH_2 (categoryspec, c);
              break;
 
            case 'C':
              laststart = b;
-             PATFETCH_RAW (c);
+             PATFETCH (c);
              BUF_PUSH_2 (notcategoryspec, c);
              break;
 #endif /* emacs */
@@ -3193,7 +3473,6 @@ regex_compile (pattern, size, syntax, bufp)
              /* You might think it would be useful for \ to mean
                 not to translate; but if we don't translate it
                 it will never match anything.  */
-             c = TRANSLATE (c);
              goto normal_char;
            }
          break;
@@ -3202,7 +3481,7 @@ regex_compile (pattern, size, syntax, bufp)
        default:
        /* Expects the character in `c'.  */
        normal_char:
-             /* If no exactn currently being built.  */
+         /* If no exactn currently being built.  */
          if (!pending_exact
 
              /* If last exactn not at current position.  */
@@ -3233,6 +3512,7 @@ regex_compile (pattern, size, syntax, bufp)
          {
            int len;
 
+           c = TRANSLATE (c);
            if (multibyte)
              len = CHAR_STRING (c, b);
            else
@@ -4395,7 +4675,7 @@ mutually_exclusive_p (bufp, p1, p2)
             they don't overlap.  The union of the two sets of excluded
             chars should cover all possible chars, which, as a matter of
             fact, is virtually impossible in multibyte buffers.  */
-         ;
+         break;
        }
       break;