Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA. */
-/* TODO:
+/* BUGS:
+ - (x?)*y\1z should match both xxxxyxz and xxxyz.
+ TODO:
- structure the opcode space into opcode+flag.
- merge with glibc's regex.[ch].
- replace (succeed_n + jump_n + set_number_at) with something that doesn't
} \
} while (0)
+/* Discard a saved register off the stack. */
+#define DISCARD_FAILURE_REG_OR_COUNT() \
+do { \
+ int reg = POP_FAILURE_INT (); \
+ if (reg == -1) \
+ { \
+ /* It's a counter. */ \
+ POP_FAILURE_POINTER (); \
+ reg = POP_FAILURE_INT (); \
+ DEBUG_PRINT3 (" Discard counter %p = %d\n", ptr, reg); \
+ } \
+ else \
+ { \
+ POP_FAILURE_POINTER (); \
+ POP_FAILURE_POINTER (); \
+ DEBUG_PRINT4 (" Discard reg %d (spanning %p -> %p)\n", \
+ reg, regstart[reg], regend[reg]); \
+ } \
+} while (0)
+
/* Check that we are not stuck in an infinite loop. */
#define CHECK_INFINITE_LOOP(pat_cur, string_place) \
do { \
- int failure = TOP_FAILURE_HANDLE(); \
+ int failure = TOP_FAILURE_HANDLE (); \
/* Check for infinite matching loops */ \
- while (failure > 0 && \
- (FAILURE_STR (failure) == string_place \
- || FAILURE_STR (failure) == NULL)) \
+ while (failure > 0 \
+ && (FAILURE_STR (failure) == string_place \
+ || FAILURE_STR (failure) == NULL)) \
{ \
assert (FAILURE_PAT (failure) >= bufp->buffer \
&& FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
if (FAILURE_PAT (failure) == pat_cur) \
- goto fail; \
+ { \
+ while (fail_stack.frame < fail_stack.avail) \
+ DISCARD_FAILURE_REG_OR_COUNT (); \
+ goto fail; \
+ } \
DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
failure = NEXT_FAILURE_HANDLE(failure); \
} \
static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
char *fastmap, const int multibyte));
-/* Fetch the next character in the uncompiled pattern---translating it
- if necessary. */
-#define PATFETCH(c) \
- do { \
- PATFETCH_RAW (c); \
- c = TRANSLATE (c); \
- } while (0)
-
/* Fetch the next character in the uncompiled pattern, with no
translation. */
-#define PATFETCH_RAW(c) \
+#define PATFETCH(c) \
do { \
int len; \
if (p == pend) return REG_EEND; \
/* But patterns can have more than `MAX_REGNUM' registers. We just
ignore the excess. */
-typedef unsigned regnum_t;
+typedef int regnum_t;
/* Macros for the compile stack. */
/* The next available element. */
#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
-
+/* Explicit quit checking is only used on NTemacs. */
+#if defined WINDOWSNT && defined emacs && defined QUIT
+extern int immediate_quit;
+# define IMMEDIATE_QUIT_CHECK \
+ do { \
+ if (immediate_quit) QUIT; \
+ } while (0)
+#else
+# define IMMEDIATE_QUIT_CHECK ((void)0)
+#endif
+\f
/* Structure to manage work area for range table. */
struct range_table_work_area
{
int bits; /* flag to record character classes */
};
-/* Make sure that WORK_AREA can hold more N multibyte characters. */
-#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
- do { \
- if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
- { \
- (work_area).allocated += 16 * sizeof (int); \
- if ((work_area).table) \
- (work_area).table \
- = (int *) realloc ((work_area).table, (work_area).allocated); \
- else \
- (work_area).table \
- = (int *) malloc ((work_area).allocated); \
- if ((work_area).table == 0) \
- FREE_STACK_RETURN (REG_ESPACE); \
- } \
+/* Make sure that WORK_AREA can hold more N multibyte characters.
+ This is used only in set_image_of_range and set_image_of_range_1.
+ It expects WORK_AREA to be a pointer.
+ If it can't get the space, it returns from the surrounding function. */
+
+#define EXTEND_RANGE_TABLE(work_area, n) \
+ do { \
+ if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
+ { \
+ extend_range_table_work_area (work_area); \
+ if ((work_area)->table == 0) \
+ return (REG_ESPACE); \
+ } \
} while (0)
#define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
#define BIT_UPPER 0x10
#define BIT_MULTIBYTE 0x20
-/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
+/* Set a range START..END to WORK_AREA.
+ The range is passed through TRANSLATE, so START and END
+ should be untranslated. */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
do { \
- EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
- (work_area).table[(work_area).used++] = (range_start); \
- (work_area).table[(work_area).used++] = (range_end); \
+ int tem; \
+ tem = set_image_of_range (&work_area, start, end, translate); \
+ if (tem > 0) \
+ FREE_STACK_RETURN (tem); \
} while (0)
/* Free allocated memory for WORK_AREA. */
#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
#define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
-
+\f
/* Set the bit for character C in a list. */
#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
do { if (p != pend) \
{ \
PATFETCH (c); \
+ if (c == ' ') \
+ FREE_STACK_RETURN (REG_BADBR); \
while ('0' <= c && c <= '9') \
{ \
+ int prev; \
if (num < 0) \
- num = 0; \
+ num = 0; \
+ prev = num; \
num = num * 10 + c - '0'; \
+ if (num / 10 != prev) \
+ FREE_STACK_RETURN (REG_BADBR); \
if (p == pend) \
- break; \
+ break; \
PATFETCH (c); \
} \
+ if (c == ' ') \
+ FREE_STACK_RETURN (REG_BADBR); \
} \
} while (0)
-
+\f
#if WIDE_CHAR_SUPPORT
/* The GNU C library provides support for user-defined character classes
and the functions from ISO C amendement 1. */
}
}
#endif
+\f
+/* Filling in the work area of a range. */
-/* Explicit quit checking is only used on NTemacs. */
-#if defined WINDOWSNT && defined emacs && defined QUIT
-extern int immediate_quit;
-# define IMMEDIATE_QUIT_CHECK \
- do { \
- if (immediate_quit) QUIT; \
- } while (0)
-#else
-# define IMMEDIATE_QUIT_CHECK ((void)0)
+/* Actually extend the space in WORK_AREA. */
+
+static void
+extend_range_table_work_area (work_area)
+ struct range_table_work_area *work_area;
+{
+ work_area->allocated += 16 * sizeof (int);
+ if (work_area->table)
+ work_area->table
+ = (int *) realloc (work_area->table, work_area->allocated);
+ else
+ work_area->table
+ = (int *) malloc (work_area->allocated);
+}
+
+#ifdef emacs
+
+/* Carefully find the ranges of codes that are equivalent
+ under case conversion to the range start..end when passed through
+ TRANSLATE. Handle the case where non-letters can come in between
+ two upper-case letters (which happens in Latin-1).
+ Also handle the case of groups of more than 2 case-equivalent chars.
+
+ The basic method is to look at consecutive characters and see
+ if they can form a run that can be handled as one.
+
+ Returns -1 if successful, REG_ESPACE if ran out of space. */
+
+static int
+set_image_of_range_1 (work_area, start, end, translate)
+ RE_TRANSLATE_TYPE translate;
+ struct range_table_work_area *work_area;
+ re_wchar_t start, end;
+{
+ /* `one_case' indicates a character, or a run of characters,
+ each of which is an isolate (no case-equivalents).
+ This includes all ASCII non-letters.
+
+ `two_case' indicates a character, or a run of characters,
+ each of which has two case-equivalent forms.
+ This includes all ASCII letters.
+
+ `strange' indicates a character that has more than one
+ case-equivalent. */
+
+ enum case_type {one_case, two_case, strange};
+
+ /* Describe the run that is in progress,
+ which the next character can try to extend.
+ If run_type is strange, that means there really is no run.
+ If run_type is one_case, then run_start...run_end is the run.
+ If run_type is two_case, then the run is run_start...run_end,
+ and the case-equivalents end at run_eqv_end. */
+
+ enum case_type run_type = strange;
+ int run_start, run_end, run_eqv_end;
+
+ Lisp_Object eqv_table;
+
+ if (!RE_TRANSLATE_P (translate))
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = (start);
+ work_area->table[work_area->used++] = (end);
+ return -1;
+ }
+
+ eqv_table = XCHAR_TABLE (translate)->extras[2];
+
+ for (; start <= end; start++)
+ {
+ enum case_type this_type;
+ int eqv = RE_TRANSLATE (eqv_table, start);
+ int minchar, maxchar;
+
+ /* Classify this character */
+ if (eqv == start)
+ this_type = one_case;
+ else if (RE_TRANSLATE (eqv_table, eqv) == start)
+ this_type = two_case;
+ else
+ this_type = strange;
+
+ if (start < eqv)
+ minchar = start, maxchar = eqv;
+ else
+ minchar = eqv, maxchar = start;
+
+ /* Can this character extend the run in progress? */
+ if (this_type == strange || this_type != run_type
+ || !(minchar == run_end + 1
+ && (run_type == two_case
+ ? maxchar == run_eqv_end + 1 : 1)))
+ {
+ /* No, end the run.
+ Record each of its equivalent ranges. */
+ if (run_type == one_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ }
+ else if (run_type == two_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 4);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_start);
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_end);
+ }
+ run_type = strange;
+ }
+
+ if (this_type == strange)
+ {
+ /* For a strange character, add each of its equivalents, one
+ by one. Don't start a range. */
+ do
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = eqv;
+ work_area->table[work_area->used++] = eqv;
+ eqv = RE_TRANSLATE (eqv_table, eqv);
+ }
+ while (eqv != start);
+ }
+
+ /* Add this char to the run, or start a new run. */
+ else if (run_type == strange)
+ {
+ /* Initialize a new range. */
+ run_type = this_type;
+ run_start = start;
+ run_end = start;
+ run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
+ }
+ else
+ {
+ /* Extend a running range. */
+ run_end = minchar;
+ run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
+ }
+ }
+
+ /* If a run is still in progress at the end, finish it now
+ by recording its equivalent ranges. */
+ if (run_type == one_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ }
+ else if (run_type == two_case)
+ {
+ EXTEND_RANGE_TABLE (work_area, 4);
+ work_area->table[work_area->used++] = run_start;
+ work_area->table[work_area->used++] = run_end;
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_start);
+ work_area->table[work_area->used++]
+ = RE_TRANSLATE (eqv_table, run_end);
+ }
+
+ return -1;
+}
+
+#endif /* emacs */
+
+/* Record the the image of the range start..end when passed through
+ TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
+ and is not even necessarily contiguous.
+ Normally we approximate it with the smallest contiguous range that contains
+ all the chars we need. However, for Latin-1 we go to extra effort
+ to do a better job.
+
+ This function is not called for ASCII ranges.
+
+ Returns -1 if successful, REG_ESPACE if ran out of space. */
+
+static int
+set_image_of_range (work_area, start, end, translate)
+ RE_TRANSLATE_TYPE translate;
+ struct range_table_work_area *work_area;
+ re_wchar_t start, end;
+{
+ re_wchar_t cmin, cmax;
+
+#ifdef emacs
+ /* For Latin-1 ranges, use set_image_of_range_1
+ to get proper handling of ranges that include letters and nonletters.
+ For a range that includes the whole of Latin-1, this is not necessary.
+ For other character sets, we don't bother to get this right. */
+ if (RE_TRANSLATE_P (translate) && start < 04400
+ && !(start < 04200 && end >= 04377))
+ {
+ int newend;
+ int tem;
+ newend = end;
+ if (newend > 04377)
+ newend = 04377;
+ tem = set_image_of_range_1 (work_area, start, newend, translate);
+ if (tem > 0)
+ return tem;
+
+ start = 04400;
+ if (end < 04400)
+ return -1;
+ }
#endif
+
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = (start);
+ work_area->table[work_area->used++] = (end);
+
+ cmin = -1, cmax = -1;
+
+ if (RE_TRANSLATE_P (translate))
+ {
+ int ch;
+
+ for (ch = start; ch <= end; ch++)
+ {
+ re_wchar_t c = TRANSLATE (ch);
+ if (! (start <= c && c <= end))
+ {
+ if (cmin == -1)
+ cmin = c, cmax = c;
+ else
+ {
+ cmin = MIN (cmin, c);
+ cmax = MAX (cmax, c);
+ }
+ }
+ }
+
+ if (cmin != -1)
+ {
+ EXTEND_RANGE_TABLE (work_area, 2);
+ work_area->table[work_area->used++] = (cmin);
+ work_area->table[work_area->used++] = (cmax);
+ }
+ }
+
+ return -1;
+}
\f
#ifndef MATCH_MAY_ALLOCATE
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+ /* Don't translate yet. The range TRANSLATE(X..Y) cannot
+ always be determined from TRANSLATE(X) and TRANSLATE(Y)
+ So the translation is done later in a loop. Example:
+ (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
PATFETCH (c);
/* \ might escape characters inside [...] and [^...]. */
them). */
if (c == ':' && *p == ']')
{
- int ch;
+ re_wchar_t ch;
re_wctype_t cc;
cc = re_wctype (str);
starting at the smallest character in
the charset of C1 and ending at C1. */
int charset = CHAR_CHARSET (c1);
- int c2 = MAKE_CHAR (charset, 0, 0);
-
+ re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
+
SET_RANGE_TABLE_WORK_AREA (range_table_work,
c2, c1);
c1 = 0377;
/* ... into bitmap. */
{
re_wchar_t this_char;
- int range_start = c, range_end = c1;
+ re_wchar_t range_start = c, range_end = c1;
/* If the start is after the end, the range is empty. */
if (range_start > range_end)
/* Do not translate the character after the \, so that we can
distinguish, e.g., \B from \b, even if we normally would
translate, e.g., B to b. */
- PATFETCH_RAW (c);
+ PATFETCH (c);
switch (c)
{
case 'c':
laststart = b;
- PATFETCH_RAW (c);
+ PATFETCH (c);
BUF_PUSH_2 (categoryspec, c);
break;
case 'C':
laststart = b;
- PATFETCH_RAW (c);
+ PATFETCH (c);
BUF_PUSH_2 (notcategoryspec, c);
break;
#endif /* emacs */
/* You might think it would be useful for \ to mean
not to translate; but if we don't translate it
it will never match anything. */
- c = TRANSLATE (c);
goto normal_char;
}
break;
default:
/* Expects the character in `c'. */
normal_char:
- /* If no exactn currently being built. */
+ /* If no exactn currently being built. */
if (!pending_exact
/* If last exactn not at current position. */
{
int len;
+ c = TRANSLATE (c);
if (multibyte)
len = CHAR_STRING (c, b);
else
they don't overlap. The union of the two sets of excluded
chars should cover all possible chars, which, as a matter of
fact, is virtually impossible in multibyte buffers. */
- ;
+ break;
}
break;