internationalization features.)
Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
- 2002, 2003, 2004, 2005, 2006, 2007
+ 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
# define SYNTAX_ENTRY_VIA_PROPERTY
# include "syntax.h"
-# include "charset.h"
+# include "character.h"
# include "category.h"
# ifdef malloc
# define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
# define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
-# define RE_STRING_CHAR(p, s) \
+# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
+# define RE_STRING_CHAR(p, s, multibyte) \
(multibyte ? (STRING_CHAR (p, s)) : (*(p)))
-# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
+# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \
(multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
-/* Set C a (possibly multibyte) character before P. P points into a
- string which is the virtual concatenation of STR1 (which ends at
- END1) or STR2 (which ends at END2). */
-# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
- do { \
- if (multibyte) \
- { \
- re_char *dtemp = (p) == (str2) ? (end1) : (p); \
- re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
- re_char *d0 = dtemp; \
- PREV_CHAR_BOUNDARY (d0, dlimit); \
- c = STRING_CHAR (d0, dtemp - d0); \
- } \
- else \
- (c = ((p) == (str2) ? (end1) : (p))[-1]); \
+# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)]
+
+# define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
+
+/* Set C to the character that ends just before P, converting it to
+ multibyte when the target is unibyte. P points into a string which
+ is the virtual concatenation of STR1 (which ends at END1) and STR2
+ (which ends at END2). When P is at the start of STR2 the character
+ is taken from the end of STR1. Expects a variable named
+ `target_multibyte' to be in scope where the macro is expanded. */
+# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
+ do { \
+ if (target_multibyte) \
+ { \
+ re_char *dtemp = (p) == (str2) ? (end1) : (p); \
+ re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
+ while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
+ c = STRING_CHAR (dtemp, (p) - dtemp); \
+ } \
+ else \
+ { \
+ (c = ((p) == (str2) ? (end1) : (p))[-1]); \
+ (c) = RE_CHAR_TO_MULTIBYTE (c); \
+ } \
} while (0)
+/* Set C to the character at P, converting it to multibyte when the
+ target is unibyte, and set LEN to the byte length of that character
+ in the target (always 1 for a unibyte target). Expects a variable
+ named `target_multibyte' to be in scope where the macro is expanded.
+ P and LEN are parenthesized so expression arguments expand safely. */
+# define GET_CHAR_AFTER(c, p, len) \
+ do { \
+ if (target_multibyte) \
+ (c) = STRING_CHAR_AND_LENGTH (p, 0, len); \
+ else \
+ { \
+ (c) = *(p); \
+ (len) = 1; \
+ (c) = RE_CHAR_TO_MULTIBYTE (c); \
+ } \
+ } while (0)
#else /* not emacs */
/* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
-# ifdef SWITCH_ENUM_BUG
-# define SWITCH_ENUM_CAST(x) ((int)(x))
-# else
# define SWITCH_ENUM_CAST(x) (x)
-# endif
/* Dummy macros for non-Emacs environments. */
# define BASE_LEADING_CODE_P(c) (0)
# define CHARSET_LEADING_CODE_BASE(c) 0
# define MAX_MULTIBYTE_LENGTH 1
# define RE_MULTIBYTE_P(x) 0
+# define RE_TARGET_MULTIBYTE_P(x) 0
# define WORD_BOUNDARY_P(c1, c2) (0)
# define CHAR_HEAD_P(p) (1)
# define SINGLE_BYTE_CHAR_P(c) (1)
# define MULTIBYTE_FORM_LENGTH(p, s) (1)
# define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
# define STRING_CHAR(p, s) (*(p))
-# define RE_STRING_CHAR STRING_CHAR
+# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s))
# define CHAR_STRING(c, s) (*(s) = (c), 1)
# define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
-# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
+# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) STRING_CHAR_AND_LENGTH ((p), (s), (len))
+# define RE_CHAR_TO_MULTIBYTE(c) (c)
+# define RE_CHAR_TO_UNIBYTE(c) (c)
# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
(c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
+/* Non-Emacs variant: C is the byte at P and LEN is always 1.
+ Arguments are parenthesized so expression arguments expand safely. */
+# define GET_CHAR_AFTER(c, p, len) \
+ ((c) = *(p), (len) = 1)
# define MAKE_CHAR(charset, c1, c2) (c1)
+# define BYTE8_TO_CHAR(c) (c)
+# define CHAR_BYTE8_P(c) (0)
+# define CHAR_LEADING_CODE(c) (c)
+
#endif /* not emacs */
#ifndef RE_TRANSLATE
# ifdef __GNUC__
# define alloca __builtin_alloca
# else /* not __GNUC__ */
-# if HAVE_ALLOCA_H
+# ifdef HAVE_ALLOCA_H
# include <alloca.h>
# endif /* HAVE_ALLOCA_H */
# endif /* not __GNUC__ */
/* Normally, this is fine. */
#define MATCH_MAY_ALLOCATE
-/* When using GNU C, we are not REALLY using the C alloca, no matter
- what config.h may say. So don't take precautions for it. */
-#ifdef __GNUC__
-# undef C_ALLOCA
-#endif
-
/* The match routines may not allocate if (1) they would do it with malloc
and (2) it's not safe for them to use malloc.
Note that if REL_ALLOC is defined, matching would not use malloc for the
failure stack, but we would still use it for the register vectors;
so REL_ALLOC should not affect this. */
-#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
+#if defined REGEX_MALLOC && defined emacs
# undef MATCH_MAY_ALLOCATE
#endif
do { \
int len; \
if (p == pend) return REG_EEND; \
- c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
+ c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len, multibyte); \
p += len; \
} while (0)
being larger than MAX_BUF_SIZE, then flag memory exhausted. */
#if __BOUNDED_POINTERS__
# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
-# define MOVE_BUFFER_POINTER(P) \
- (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
+# define MOVE_BUFFER_POINTER(P) \
+ (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
+ SET_HIGH_BOUND (P), \
+ __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND \
else \
{ \
SET_HIGH_BOUND (pending_exact); \
}
#else
-# define MOVE_BUFFER_POINTER(P) (P) += incr
+# define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
# define ELSE_EXTEND_BUFFER_HIGH_BOUND
#endif
#define EXTEND_BUFFER() \
do { \
- re_char *old_buffer = bufp->buffer; \
+ unsigned char *old_buffer = bufp->buffer; \
if (bufp->allocated == MAX_BUF_SIZE) \
return REG_ESIZE; \
bufp->allocated <<= 1; \
/* If the buffer moved, move all the pointers into it. */ \
if (old_buffer != bufp->buffer) \
{ \
- int incr = bufp->buffer - old_buffer; \
+ unsigned char *new_buffer = bufp->buffer; \
MOVE_BUFFER_POINTER (b); \
MOVE_BUFFER_POINTER (begalt); \
if (fixup_alt_jump) \
#define EXTEND_RANGE_TABLE(work_area, n) \
do { \
- if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \
+ if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
{ \
- extend_range_table_work_area (work_area); \
- if ((work_area)->table == 0) \
+ extend_range_table_work_area (&work_area); \
+ if ((work_area).table == 0) \
return (REG_ESPACE); \
} \
} while (0)
#define BIT_UPPER 0x10
#define BIT_MULTIBYTE 0x20
-/* Set a range START..END to WORK_AREA.
- The range is passed through TRANSLATE, so START and END
- should be untranslated. */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \
+/* Append the range RANGE_START..RANGE_END to the table of WORK_AREA,
+ growing the table first if it has no room for two more entries.
+ WORK_AREA must be the work-area object itself, not a pointer to it.
+ If the table cannot be grown, the enclosing function returns
+ REG_ESPACE. */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
do { \
- int tem; \
- tem = set_image_of_range (&work_area, start, end, translate); \
- if (tem > 0) \
- FREE_STACK_RETURN (tem); \
+ EXTEND_RANGE_TABLE ((work_area), 2); \
+ (work_area).table[(work_area).used++] = (range_start); \
+ (work_area).table[(work_area).used++] = (range_end); \
} while (0)
/* Free allocated memory for WORK_AREA. */
#define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
+#ifdef emacs
+
+/* Store characters in the range FROM to TO in the bitmap at B (for
+ ASCII and unibyte characters) and WORK_AREA (for multibyte
+ characters) while translating them and paying attention to the
+ continuity of translated characters.
+
+ Implementation note: It is better to implement these fairly big
+ macros by a function, but it's not that easy because macros called
+ in this macro assume various local variables already declared. */
+
+/* Both FROM and TO are ASCII characters. Each character in the range
+ is translated and the bitmap bit of the result is set. When the
+ translation is not ASCII, it is also recorded in WORK_AREA as a
+ one-character range, and the bitmap bit used is that of its unibyte
+ form, or of the untranslated character if it has none. */
+
+#define SETUP_ASCII_RANGE(work_area, FROM, TO) \
+ do { \
+ int C0, C1; \
+ \
+ for (C0 = (FROM); C0 <= (TO); C0++) \
+ { \
+ C1 = TRANSLATE (C0); \
+ if (! ASCII_CHAR_P (C1)) \
+ { \
+ SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
+ if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
+ C1 = C0; \
+ } \
+ SET_LIST_BIT (C1); \
+ } \
+ } while (0)
+
+
+/* Both FROM and TO are unibyte characters (0x80..0xFF). A byte for
+ which CHAR_BYTE8_P holds on its multibyte form only gets its own
+ bitmap bit. Any other byte is translated as a multibyte character:
+ the bitmap bit of the translation's unibyte form is set (or that of
+ the byte itself when the translation is the identity or has no
+ unibyte form), and the translated character is recorded in
+ WORK_AREA, widening a range added by this expansion when the
+ character is adjacent to it and appending a one-character range
+ when no such range contains or adjoins it. */
+
+#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
+ do { \
+ int C0, C1, C2, I; \
+ int USED = RANGE_TABLE_WORK_USED (work_area); \
+ \
+ for (C0 = (FROM); C0 <= (TO); C0++) \
+ { \
+ C1 = RE_CHAR_TO_MULTIBYTE (C0); \
+ if (CHAR_BYTE8_P (C1)) \
+ SET_LIST_BIT (C0); \
+ else \
+ { \
+ C2 = TRANSLATE (C1); \
+ if (C2 == C1 \
+ || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
+ C1 = C0; \
+ SET_LIST_BIT (C1); \
+ for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
+ { \
+ int from = RANGE_TABLE_WORK_ELT (work_area, I); \
+ int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
+ \
+ if (C2 >= from - 1 && C2 <= to + 1) \
+ { \
+ if (C2 == from - 1) \
+ RANGE_TABLE_WORK_ELT (work_area, I)--; \
+ else if (C2 == to + 1) \
+ RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
+ break; \
+ } \
+ } \
+ if (I < USED) \
+ SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
+ } \
+ } \
+ } while (0)
+
+
+/* Both FROM and TO are multibyte characters. The range FROM..TO is
+ recorded in WORK_AREA as is. Then each character in it is
+ translated: if the translation (or, when it differs, the original
+ character) has a unibyte form, that byte's bitmap bit is set; and a
+ translation falling outside FROM..TO is also recorded in WORK_AREA,
+ widening a range added by this expansion when adjacent to it and
+ appending a one-character range when no such range contains or
+ adjoins it. */
+
+#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
+ do { \
+ int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
+ \
+ SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
+ for (C0 = (FROM); C0 <= (TO); C0++) \
+ { \
+ C1 = TRANSLATE (C0); \
+ if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
+ || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
+ SET_LIST_BIT (C2); \
+ if (C1 >= (FROM) && C1 <= (TO)) \
+ continue; \
+ for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
+ { \
+ int from = RANGE_TABLE_WORK_ELT (work_area, I); \
+ int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
+ \
+ if (C1 >= from - 1 && C1 <= to + 1) \
+ { \
+ if (C1 == from - 1) \
+ RANGE_TABLE_WORK_ELT (work_area, I)--; \
+ else if (C1 == to + 1) \
+ RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
+ break; \
+ } \
+ } \
+ if (I < USED) \
+ SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
+ } \
+ } while (0)
+
+#endif /* emacs */
+
/* Get the next unsigned number in the uncompiled pattern. */
#define GET_UNSIGNED_NUMBER(num) \
do { \
= (int *) malloc (work_area->allocated);
}
+#if 0
#ifdef emacs
/* Carefully find the ranges of codes that are equivalent
return -1;
}
+#endif /* 0 */
\f
#ifndef MATCH_MAY_ALLOCATE
/* If the object matched can contain multibyte characters. */
const boolean multibyte = RE_MULTIBYTE_P (bufp);
+ /* If a target of matching can contain multibyte characters. */
+ const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
+
/* Nonzero if we have pushed down into a subpattern. */
int in_subpattern = 0;
{
boolean escaped_char = false;
const unsigned char *p2 = p;
+ re_wchar_t ch, c2;
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
break;
}
- /* What should we do for the character which is
- greater than 0x7F, but not BASE_LEADING_CODE_P?
- XXX */
-
/* See if we're at the beginning of a possible character
class. */
them). */
if (c == ':' && *p == ']')
{
- re_wchar_t ch;
re_wctype_t cc;
+ int limit;
cc = re_wctype (str);
if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
+#ifndef emacs
+ for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
+ if (re_iswctype (btowc (ch), cc))
+ {
+ c = TRANSLATE (ch);
+ if (c < (1 << BYTEWIDTH))
+ SET_LIST_BIT (c);
+ }
+#else /* emacs */
/* Most character classes in a multibyte match
just set a flag. Exceptions are is_blank,
is_digit, is_cntrl, and is_xdigit, since
don't need to handle them for multibyte.
They are distinguished by a negative wctype. */
- if (multibyte)
- SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
- re_wctype_to_bit (cc));
-
- for (ch = 0; ch < 1 << BYTEWIDTH; ++ch)
+ for (ch = 0; ch < 256; ++ch)
{
- int translated = TRANSLATE (ch);
- if (translated < (1 << BYTEWIDTH)
- && re_iswctype (btowc (ch), cc))
- SET_LIST_BIT (translated);
+ c = RE_CHAR_TO_MULTIBYTE (ch);
+ if (! CHAR_BYTE8_P (c)
+ && re_iswctype (c, cc))
+ {
+ SET_LIST_BIT (ch);
+ c1 = TRANSLATE (c);
+ if (c1 == c)
+ continue;
+ if (ASCII_CHAR_P (c1))
+ SET_LIST_BIT (c1);
+ else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
+ SET_LIST_BIT (c1);
+ }
}
-
+ SET_RANGE_TABLE_WORK_AREA_BIT
+ (range_table_work, re_wctype_to_bit (cc));
+#endif /* emacs */
/* In most cases the matching rule for char classes
only uses the syntax table for multibyte chars,
so that the content of the syntax-table it is not
/* Fetch the character which ends the range. */
PATFETCH (c1);
-
- if (SINGLE_BYTE_CHAR_P (c))
- {
- if (! SINGLE_BYTE_CHAR_P (c1))
- {
- /* Handle a range starting with a
- character of less than 256, and ending
- with a character of not less than 256.
- Split that into two ranges, the low one
- ending at 0377, and the high one
- starting at the smallest character in
- the charset of C1 and ending at C1. */
- int charset = CHAR_CHARSET (c1);
- re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
-
- SET_RANGE_TABLE_WORK_AREA (range_table_work,
- c2, c1);
- c1 = 0377;
- }
- }
- else if (!SAME_CHARSET_P (c, c1))
- FREE_STACK_RETURN (REG_ERANGEX);
+#ifdef emacs
+ if (CHAR_BYTE8_P (c1)
+ && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
+ /* Treat the range from a multibyte character to
+ raw-byte character as empty. */
+ c = c1 + 1;
+#endif /* emacs */
}
else
/* Range from C to C. */
c1 = c;
- /* Set the range ... */
- if (SINGLE_BYTE_CHAR_P (c))
- /* ... into bitmap. */
+ if (c > c1)
{
- re_wchar_t this_char;
- re_wchar_t range_start = c, range_end = c1;
-
- /* If the start is after the end, the range is empty. */
- if (range_start > range_end)
+ if (syntax & RE_NO_EMPTY_RANGES)
+ FREE_STACK_RETURN (REG_ERANGEX);
+ /* Else, repeat the loop. */
+ }
+ else
+ {
+#ifndef emacs
+ /* Set the range into bitmap */
+ for (; c <= c1; c++)
{
- if (syntax & RE_NO_EMPTY_RANGES)
- FREE_STACK_RETURN (REG_ERANGE);
- /* Else, repeat the loop. */
+ ch = TRANSLATE (c);
+ if (ch < (1 << BYTEWIDTH))
+ SET_LIST_BIT (ch);
}
- else
+#else /* emacs */
+ if (c < 128)
{
- for (this_char = range_start; this_char <= range_end;
- this_char++)
+ ch = MIN (127, c1);
+ SETUP_ASCII_RANGE (range_table_work, c, ch);
+ c = ch + 1;
+ if (CHAR_BYTE8_P (c1))
+ c = BYTE8_TO_CHAR (128);
+ }
+ if (c <= c1)
+ {
+ if (CHAR_BYTE8_P (c))
+ {
+ c = CHAR_TO_BYTE8 (c);
+ c1 = CHAR_TO_BYTE8 (c1);
+ for (; c <= c1; c++)
+ SET_LIST_BIT (c);
+ }
+ else if (multibyte)
+ {
+ SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
+ }
+ else
{
- int translated = TRANSLATE (this_char);
- if (translated < (1 << BYTEWIDTH))
- SET_LIST_BIT (translated);
- else
- SET_RANGE_TABLE_WORK_AREA
- (range_table_work, translated, translated);
+ SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
}
}
+#endif /* emacs */
}
- else
- /* ... into range table. */
- SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
}
/* Discard any (non)matching list bytes that are all 0 at the
{
int len;
- c = TRANSLATE (c);
if (multibyte)
- len = CHAR_STRING (c, b);
+ {
+ c = TRANSLATE (c);
+ len = CHAR_STRING (c, b);
+ b += len;
+ }
else
- *b = c, len = 1;
- b += len;
+ {
+ c1 = RE_CHAR_TO_MULTIBYTE (c);
+ if (! CHAR_BYTE8_P (c1))
+ {
+ re_wchar_t c2 = TRANSLATE (c1);
+
+ if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
+ c = c1;
+ }
+ *b++ = c;
+ len = 1;
+ }
(*pending_exact) += len;
}
case exactn:
if (fastmap)
{
- int c = RE_STRING_CHAR (p + 1, pend - p);
- /* When fast-scanning, the fastmap can be indexed either with
- a char (smaller than 256) or with the first byte of
- a char's byte sequence. So we have to conservatively add
- both to the table. */
- if (SINGLE_BYTE_CHAR_P (c))
- fastmap[c] = 1;
+ /* If multibyte is nonzero, the first byte of each
+ character is an ASCII or a leading code. Otherwise,
+ each byte is a character. Thus, this works in both
+ cases. */
fastmap[p[1]] = 1;
+ if (! multibyte)
+ {
+ /* For the case of matching this unibyte regex
+ against multibyte, we must set a leading code of
+ the corresponding multibyte character. */
+ int c = RE_CHAR_TO_MULTIBYTE (p[1]);
+
+ if (! CHAR_BYTE8_P (c))
+ fastmap[CHAR_LEADING_CODE (c)] = 1;
+ }
}
break;
case charset_not:
- /* Chars beyond end of bitmap are possible matches.
- All the single-byte codes can occur in multibyte buffers.
- So any that are not listed in the charset
- are possible matches, even in multibyte buffers. */
if (!fastmap) break;
- /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
- because it will automatically be set when needed by virtue of
- being larger than the highest char of its charset (0xbf) but
- smaller than (1<<BYTEWIDTH). */
- for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
- j < (1 << BYTEWIDTH); j++)
- fastmap[j] = 1;
+ {
+ /* Chars beyond end of bitmap are possible matches. */
+ for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
+ j < (1 << BYTEWIDTH); j++)
+ fastmap[j] = 1;
+ }
+
/* Fallthrough */
case charset:
if (!fastmap) break;
for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
j >= 0; j--)
if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
- {
- fastmap[j] = 1;
-#ifdef emacs
- if (j >= 0x80 && j < 0xa0)
- fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
-#endif
- }
+ fastmap[j] = 1;
- if ((not && multibyte)
- /* Any character set can possibly contain a character
+#ifdef emacs
+ if (/* Any leading code can possibly start a character
which doesn't match the specified set of characters. */
- || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
- && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
- /* If we can match a character class, we can match
- any character set. */
+ not
+ ||
+ /* If we can match a character class, we can match any
+ multibyte characters. */
+ (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
+ && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
+
{
- set_fastmap_for_multibyte_characters:
if (match_any_multibyte_characters == false)
{
- for (j = 0x80; j < 0xA0; j++) /* XXX */
- if (BASE_LEADING_CODE_P (j))
- fastmap[j] = 1;
+ for (j = MIN_MULTIBYTE_LEADING_CODE;
+ j <= MAX_MULTIBYTE_LEADING_CODE; j++)
+ fastmap[j] = 1;
match_any_multibyte_characters = true;
}
}
else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
&& match_any_multibyte_characters == false)
{
- /* Set fastmap[I] 1 where I is a base leading code of each
- multibyte character in the range table. */
+ /* Set fastmap[I] to 1 where I is a leading code of each
+ multibyte character in the range table. */
int c, count;
+ unsigned char lc1, lc2;
/* Make P points the range table. `+ 2' is to skip flag
bits for a character class. */
/* Extract the number of ranges in range table into COUNT. */
EXTRACT_NUMBER_AND_INCR (count, p);
- for (; count > 0; count--, p += 2 * 3) /* XXX */
+ for (; count > 0; count--, p += 3)
{
- /* Extract the start of each range. */
+ /* Extract the start and end of each range. */
+ EXTRACT_CHARACTER (c, p);
+ lc1 = CHAR_LEADING_CODE (c);
+ p += 3;
EXTRACT_CHARACTER (c, p);
- j = CHAR_CHARSET (c);
- fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
+ lc2 = CHAR_LEADING_CODE (c);
+ for (j = lc1; j <= lc2; j++)
+ fastmap[j] = 1;
}
}
+#endif
break;
case syntaxspec:
if (!fastmap) break;
not = (re_opcode_t)p[-1] == notcategoryspec;
k = *p++;
- for (j = 0; j < (1 << BYTEWIDTH); j++)
+ /* Every other fastmap loop stays below (1 << BYTEWIDTH); start at
+ the last such index so fastmap[1 << BYTEWIDTH] is never written. */
+ for (j = (1 << BYTEWIDTH) - 1; j >= 0; j--)
if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
fastmap[j] = 1;
- if (multibyte)
- /* Any character set can possibly contain a character
- whose category is K (or not). */
- goto set_fastmap_for_multibyte_characters;
+ /* Any leading code can possibly start a character which
+ has or doesn't have the specified category. */
+ if (match_any_multibyte_characters == false)
+ {
+ for (j = MIN_MULTIBYTE_LEADING_CODE;
+ j <= MAX_MULTIBYTE_LEADING_CODE; j++)
+ fastmap[j] = 1;
+ match_any_multibyte_characters = true;
+ }
break;
/* All cases after this match the empty string. These end with
int total_size = size1 + size2;
int endpos = startpos + range;
boolean anchored_start;
-
- /* Nonzero if we have to concern multibyte character. */
- const boolean multibyte = RE_MULTIBYTE_P (bufp);
+ /* Nonzero if we are searching multibyte string. */
+ const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
/* Check for out-of-range STARTPOS. */
if (startpos < 0 || startpos > total_size)
buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
buf_charlen);
-
buf_ch = RE_TRANSLATE (translate, buf_ch);
- if (buf_ch >= 0400
- || fastmap[buf_ch])
+ if (fastmap[CHAR_LEADING_CODE (buf_ch)])
break;
range -= buf_charlen;
d += buf_charlen;
}
else
- {
- /* Convert *d to integer to shut up GCC's
- whining about comparison that is always
- true. */
- int di = *d;
-
- while (range > lim
- && !fastmap[RE_TRANSLATE (translate, di)])
- {
- di = *(++d);
- range--;
- }
- }
- }
- else
- do
- {
- re_char *d_start = d;
- while (range > lim && !fastmap[*d])
+ while (range > lim)
{
+ register re_wchar_t ch, translated;
+
+ buf_ch = *d;
+ ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
+ translated = RE_TRANSLATE (translate, ch);
+ if (translated != ch
+ && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
+ buf_ch = ch;
+ if (fastmap[buf_ch])
+ break;
d++;
range--;
}
-#ifdef emacs
- if (multibyte && range > lim)
+ }
+ else
+ {
+ if (multibyte)
+ while (range > lim)
{
- /* Check that we are at the beginning of a char. */
- int at_boundary;
- AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
- if (at_boundary)
+ int buf_charlen;
+
+ buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
+ buf_charlen);
+ if (fastmap[CHAR_LEADING_CODE (buf_ch)])
break;
- else
- { /* We have matched an internal byte of a char
- rather than the leading byte, so it's a false
- positive: we should keep scanning. */
- d++; range--;
- }
+ range -= buf_charlen;
+ d += buf_charlen;
}
- else
-#endif
- break;
- } while (1);
-
+ else
+ while (range > lim && !fastmap[*d])
+ {
+ d++;
+ range--;
+ }
+ }
startpos += irange - range;
}
else /* Searching backwards. */
int room = (startpos >= size1
? size2 + size1 - startpos
: size1 - startpos);
- buf_ch = RE_STRING_CHAR (d, room);
- buf_ch = TRANSLATE (buf_ch);
-
- if (! (buf_ch >= 0400
- || fastmap[buf_ch]))
- goto advance;
+ if (multibyte)
+ {
+ buf_ch = STRING_CHAR (d, room);
+ buf_ch = TRANSLATE (buf_ch);
+ if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
+ goto advance;
+ }
+ else
+ {
+ register re_wchar_t ch, translated;
+
+ buf_ch = *d;
+ ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
+ translated = TRANSLATE (ch);
+ if (translated != ch
+ && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
+ buf_ch = ch;
+ /* BUF_CH already holds the translated byte; index the fastmap
+ with it directly, as the forward scan does. Translating it
+ again could yield a value outside the fastmap. */
+ if (! fastmap[buf_ch])
+ goto advance;
+ }
}
}
val = re_match_2_internal (bufp, string1, size1, string2, size2,
startpos, regs, stop);
-#ifndef REGEX_MALLOC
-# ifdef C_ALLOCA
- alloca (0);
-# endif
-#endif
if (val >= 0)
return startpos;
{
register re_wchar_t c
= (re_opcode_t) *p2 == endline ? '\n'
- : RE_STRING_CHAR (p2 + 2, pend - p2 - 2);
+ : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte);
if ((re_opcode_t) *p1 == exactn)
{
- if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
+ if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte))
{
DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
return 1;
/* Test if C is listed in charset (or charset_not)
at `p1'. */
- if (SINGLE_BYTE_CHAR_P (c))
+ if (! multibyte || IS_REAL_ASCII (c))
{
if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
&& p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
size of bitmap table of P1 is extracted by
using macro `CHARSET_BITMAP_SIZE'.
- Since we know that all the character listed in
- P2 is ASCII, it is enough to test only bitmap
- table of P1. */
+ In a multibyte case, we know that all the character
+ listed in P2 is ASCII. In a unibyte case, P1 has only a
+ bitmap table. So, in both cases, it is enough to test
+ only the bitmap table of P1. */
if ((re_opcode_t) *p1 == charset)
{
{
int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
pos, regs, size);
-# if defined C_ALLOCA && !defined REGEX_MALLOC
- alloca (0);
-# endif
return result;
}
WEAK_ALIAS (__re_match, re_match)
result = re_match_2_internal (bufp, (re_char*) string1, size1,
(re_char*) string2, size2,
pos, regs, stop);
-#if defined C_ALLOCA && !defined REGEX_MALLOC
- alloca (0);
-#endif
return result;
}
WEAK_ALIAS (__re_match_2, re_match_2)
+
/* This is a separate function so that we can force an alloca cleanup
afterwards. */
static int
re_char *p = bufp->buffer;
re_char *pend = p + bufp->used;
- /* We use this to map every character in the string. */
+ /* We use this to map every character in the string. */
RE_TRANSLATE_TYPE translate = bufp->translate;
- /* Nonzero if we have to concern multibyte character. */
+ /* Nonzero if BUFP is setup from a multibyte regex. */
const boolean multibyte = RE_MULTIBYTE_P (bufp);
+ /* Nonzero if STRING1/STRING2 are multibyte. */
+ const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
+
/* Failure point stack. Each place that can handle a failure further
down the line pushes a failure point on this stack. It consists of
regstart, and regend for all registers corresponding to
/* Remember the start point to rollback upon failure. */
dfail = d;
+#ifndef emacs
/* This is written out as an if-else so we don't waste time
testing `translate' inside the loop. */
if (RE_TRANSLATE_P (translate))
- {
- if (multibyte)
- do
+ do
+ {
+ PREFETCH ();
+ if (RE_TRANSLATE (translate, *d) != *p++)
{
- int pat_charlen, buf_charlen;
- unsigned int pat_ch, buf_ch;
+ d = dfail;
+ goto fail;
+ }
+ d++;
+ }
+ while (--mcnt);
+ else
+ do
+ {
+ PREFETCH ();
+ if (*d++ != *p++)
+ {
+ d = dfail;
+ goto fail;
+ }
+ }
+ while (--mcnt);
+#else /* emacs */
+ /* The cost of testing `translate' is comparatively small. */
+ if (target_multibyte)
+ do
+ {
+ int pat_charlen, buf_charlen;
+ int pat_ch, buf_ch;
- PREFETCH ();
- pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
- buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+ PREFETCH ();
+ if (multibyte)
+ pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+ else
+ {
+ pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
+ pat_charlen = 1;
+ }
+ buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
- if (RE_TRANSLATE (translate, buf_ch)
- != pat_ch)
- {
- d = dfail;
- goto fail;
- }
+ if (TRANSLATE (buf_ch) != pat_ch)
+ {
+ d = dfail;
+ goto fail;
+ }
+
+ p += pat_charlen;
+ d += buf_charlen;
+ mcnt -= pat_charlen;
+ }
+ while (mcnt > 0);
+ else
+ do
+ {
+ int pat_charlen, buf_charlen;
+ int pat_ch, buf_ch;
- p += pat_charlen;
- d += buf_charlen;
- mcnt -= pat_charlen;
+ PREFETCH ();
+ if (multibyte)
+ {
+ pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
+ pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
}
- while (mcnt > 0);
- else
- do
+ else
{
- /* Avoid compiler whining about comparison being
- always true. */
- int di;
-
- PREFETCH ();
- di = *d;
- if (RE_TRANSLATE (translate, di) != *p++)
- {
- d = dfail;
- goto fail;
- }
- d++;
+ pat_ch = *p;
+ pat_charlen = 1;
}
- while (--mcnt);
- }
- else
- {
- do
- {
- PREFETCH ();
- if (*d++ != *p++)
- {
- d = dfail;
- goto fail;
- }
- }
- while (--mcnt);
- }
+ buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
+ if (! CHAR_BYTE8_P (buf_ch))
+ {
+ buf_ch = TRANSLATE (buf_ch);
+ buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
+ if (buf_ch < 0)
+ buf_ch = *d;
+ }
+ else
+ buf_ch = *d;
+ if (buf_ch != pat_ch)
+ {
+ d = dfail;
+ goto fail;
+ }
+ p += pat_charlen;
+ d++;
+ /* MCNT counts bytes of the pattern string, and a multibyte
+ pattern character occupies PAT_CHARLEN of them, so count
+ bytes here just as the multibyte-target loop does. */
+ mcnt -= pat_charlen;
+ }
+ while (mcnt > 0);
+#endif
break;
DEBUG_PRINT1 ("EXECUTING anychar.\n");
PREFETCH ();
- buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
+ buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen,
+ target_multibyte);
buf_ch = TRANSLATE (buf_ch);
if ((!(bufp->syntax & RE_DOT_NEWLINE)
in the initial byte-length of the command. */
int count = 0;
+ /* Whether matching against a unibyte character. */
+ boolean unibyte_char = false;
+
DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
}
PREFETCH ();
- c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
- c = TRANSLATE (c); /* The character to match. */
+ c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte);
+ if (target_multibyte)
+ {
+ int c1;
+
+ c = TRANSLATE (c);
+ c1 = RE_CHAR_TO_UNIBYTE (c);
+ if (c1 >= 0)
+ {
+ unibyte_char = true;
+ c = c1;
+ }
+ }
+ else
+ {
+ int c1 = RE_CHAR_TO_MULTIBYTE (c);
- if (SINGLE_BYTE_CHAR_P (c))
+ if (! CHAR_BYTE8_P (c1))
+ {
+ c1 = TRANSLATE (c1);
+ c1 = RE_CHAR_TO_UNIBYTE (c1);
+ if (c1 >= 0)
+ {
+ unibyte_char = true;
+ c = c1;
+ }
+ }
+ else
+ unibyte_char = true;
+ }
+
+ if (unibyte_char && c < (1 << BYTEWIDTH))
{ /* Lookup bitmap. */
/* Cast to `unsigned' instead of `unsigned char' in
case the bit list is a full 32 bytes long. */
/* Compare that many; failure if mismatch, else move
past them. */
if (RE_TRANSLATE_P (translate)
- ? bcmp_translate (d, d2, mcnt, translate, multibyte)
+ ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
: memcmp (d, d2, mcnt))
{
d = dfail;
}
else
{
- unsigned char c;
+ unsigned c;
GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
if (c == '\n')
break;
is the character at D, and S2 is the syntax of C2. */
re_wchar_t c1, c2;
int s1, s2;
+ int dummy;
#ifdef emacs
int offset = PTR_TO_OFFSET (d - 1);
int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
#endif
PREFETCH_NOLIMIT ();
- c2 = RE_STRING_CHAR (d, dend - d);
+ GET_CHAR_AFTER (c2, d, dummy);
s2 = SYNTAX (c2);
if (/* Case 2: Only one of S1 and S2 is Sword. */
is the character at D, and S2 is the syntax of C2. */
re_wchar_t c1, c2;
int s1, s2;
+ int dummy;
#ifdef emacs
int offset = PTR_TO_OFFSET (d);
int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
UPDATE_SYNTAX_TABLE (charpos);
#endif
PREFETCH ();
- c2 = RE_STRING_CHAR (d, dend - d);
+ GET_CHAR_AFTER (c2, d, dummy);
s2 = SYNTAX (c2);
/* Case 2: S2 is not Sword. */
is the character at D, and S2 is the syntax of C2. */
re_wchar_t c1, c2;
int s1, s2;
+ int dummy;
#ifdef emacs
int offset = PTR_TO_OFFSET (d) - 1;
int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
if (!AT_STRINGS_END (d))
{
PREFETCH_NOLIMIT ();
- c2 = RE_STRING_CHAR (d, dend - d);
+ GET_CHAR_AFTER (c2, d, dummy);
#ifdef emacs
- UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
+ UPDATE_SYNTAX_TABLE_FORWARD (charpos);
#endif
s2 = SYNTAX (c2);
UPDATE_SYNTAX_TABLE (charpos);
#endif
PREFETCH ();
- c2 = RE_STRING_CHAR (d, dend - d);
+ c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
s2 = SYNTAX (c2);
/* Case 2: S2 is neither Sword nor Ssymbol. */
if (!AT_STRINGS_END (d))
{
PREFETCH_NOLIMIT ();
- c2 = RE_STRING_CHAR (d, dend - d);
+ c2 = RE_STRING_CHAR (d, dend - d, target_multibyte);
#ifdef emacs
UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
#endif
int len;
re_wchar_t c;
- c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
-
+ GET_CHAR_AFTER (c, d, len);
if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
goto fail;
d += len;
int len;
re_wchar_t c;
- c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
-
+ GET_CHAR_AFTER (c, d, len);
if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
goto fail;
d += len;
bytes; nonzero otherwise. */
static int
-bcmp_translate (s1, s2, len, translate, multibyte)
+bcmp_translate (s1, s2, len, translate, target_multibyte)
re_char *s1, *s2;
register int len;
RE_TRANSLATE_TYPE translate;
- const int multibyte;
+ const int target_multibyte;
{
register re_char *p1 = s1, *p2 = s2;
re_char *p1_end = s1 + len;
int p1_charlen, p2_charlen;
re_wchar_t p1_ch, p2_ch;
- p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
- p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
+ GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
+ GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
if (RE_TRANSLATE (translate, p1_ch)
!= RE_TRANSLATE (translate, p2_ch))
regfree (preg)
regex_t *preg;
{
- if (preg->buffer != NULL)
- free (preg->buffer);
+ free (preg->buffer);
preg->buffer = NULL;
preg->allocated = 0;
preg->used = 0;
- if (preg->fastmap != NULL)
- free (preg->fastmap);
+ free (preg->fastmap);
preg->fastmap = NULL;
preg->fastmap_accurate = 0;
- if (preg->translate != NULL)
- free (preg->translate);
+ free (preg->translate);
preg->translate = NULL;
}
WEAK_ALIAS (__regfree, regfree)