/* String search routines for GNU Emacs.
- Copyright (C) 1985, 86,87,93,94,97,98, 1999, 2004
- Free Software Foundation, Inc.
+ Copyright (C) 1985, 1986, 1987, 1993, 1994, 1997, 1998, 1999, 2002, 2003,
+ 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GNU Emacs.
static int simple_search ();
static int boyer_moore ();
static int search_buffer ();
+static void matcher_overflow () NO_RETURN;
static void
matcher_overflow ()
DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
doc: /* Return index of start of first match for REGEXP in STRING, or nil.
-Case is ignored if `case-fold-search' is non-nil in the current buffer.
+Matching ignores case if `case-fold-search' is non-nil.
If third arg START is non-nil, start search at that index in STRING.
For index of first char beyond the match, do (match-end 0).
`match-end' and `match-beginning' also give indices of substrings
int raw_pattern_size_byte;
unsigned char *patbuf;
int multibyte = !NILP (current_buffer->enable_multibyte_characters);
- unsigned char *base_pat = SDATA (string);
- /* Set to nozero if we find a non-ASCII char that need
- translation. */
- int char_base = 0;
+ unsigned char *base_pat;
+ /* Set to positive if we find a non-ASCII char that needs
+ translation. Otherwise set to zero later. */
+ int char_base = -1;
int boyer_moore_ok = 1;
/* MULTIBYTE says whether the text to be searched is multibyte.
if (RE && *base_pat == '\\')
{
len--;
+ raw_pattern_size--;
len_byte--;
base_pat++;
}
{
/* Check if all equivalents belong to the same
group of characters. Note that the check of C
- itself is done by the last iteration. Note
- also that we don't have to check ASCII
- characters because boyer-moore search can
- always handle their translation. */
- while (1)
+ itself is done by the last iteration. */
+ int this_char_base = -1;
+
+ while (boyer_moore_ok)
{
- if (! ASCII_BYTE_P (inverse))
+ if (ASCII_BYTE_P (inverse))
{
- if (CHAR_BYTE8_P (inverse))
+ if (this_char_base > 0)
+ boyer_moore_ok = 0;
+ else
{
- /* Boyer-moore search can't handle a
- translation of an eight-bit
- character. */
- boyer_moore_ok = 0;
- break;
- }
- else if (char_base == 0)
- char_base = inverse & ~0x3F;
- else if ((inverse & ~0x3F)
- != char_base)
- {
- boyer_moore_ok = 0;
- break;
+ this_char_base = 0;
+ if (char_base < 0)
+ char_base = this_char_base;
}
}
+ else if (CHAR_BYTE8_P (inverse))
+ /* Boyer-moore search can't handle a
+ translation of an eight-bit
+ character. */
+ boyer_moore_ok = 0;
+ else if (this_char_base < 0)
+ {
+ this_char_base = inverse & ~0x3F;
+ if (char_base < 0)
+ char_base = this_char_base;
+ else if (char_base > 0
+ && this_char_base != char_base)
+ boyer_moore_ok = 0;
+ }
+ else if ((inverse & ~0x3F) != this_char_base)
+ boyer_moore_ok = 0;
if (c == inverse)
break;
TRANSLATE (inverse, inverse_trt, inverse);
}
}
}
+ if (char_base < 0)
+ char_base = 0;
/* Store this character into the translated pattern. */
bcopy (str, pat, charlen);
{
int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
int forward = n > 0;
+ /* Number of buffer bytes matched. Note that this may be different
+ from len_byte in a multibyte buffer. */
+ int match_byte;
if (lim > pos && multibyte)
while (n > 0)
if (this_len == 0)
{
+ match_byte = this_pos_byte - pos_byte;
pos += len;
- pos_byte += len_byte;
+ pos_byte += match_byte;
break;
}
if (this_len == 0)
{
+ match_byte = len;
pos += len;
break;
}
if (pos - len < lim)
goto stop;
this_pos_byte = CHAR_TO_BYTE (this_pos);
+ match_byte = pos_byte - this_pos_byte;
while (this_len > 0)
{
if (this_len == 0)
{
pos -= len;
- pos_byte -= len_byte;
+ pos_byte -= match_byte;
break;
}
if (this_len == 0)
{
+ match_byte = len;
pos -= len;
break;
}
if (n == 0)
{
if (forward)
- set_search_regs ((multibyte ? pos_byte : pos) - len_byte, len_byte);
+ set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte);
else
- set_search_regs (multibyte ? pos_byte : pos, len_byte);
+ set_search_regs (multibyte ? pos_byte : pos, match_byte);
return pos;
}
unsigned char simple_translate[0400];
/* These are set to the preceding bytes of a byte to be translated
- if charset_base is nonzero. As the maximum byte length of a
+ if char_base is nonzero. As the maximum byte length of a
multibyte character is 5, we have to check at most four previous
bytes. */
int translate_prev_byte1 = 0;
i = infinity;
if (! NILP (trt))
{
- /* If the byte currently looking at is a head of a character
- to check case-equivalents, set CH to that character. An
- ASCII character and a non-ASCII character matching with
- CHAR_BASE are to be checked. */
+ /* If the byte we are currently looking at is the last byte
+ of a character whose case-equivalents must be checked, set
+ CH to that character. An ASCII character and a non-ASCII
+ character matching CHAR_BASE are to be checked. */
int ch = -1;
if (ASCII_BYTE_P (*ptr) || ! multibyte)
ch = *ptr;
- else if (char_base && CHAR_HEAD_P (*ptr))
+ else if (char_base
+ && ((pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1])))
{
- ch = STRING_CHAR (ptr, pat_end - ptr);
+ unsigned char *charstart = ptr - 1;
+
+ while (! (CHAR_HEAD_P (*charstart)))
+ charstart--;
+ ch = STRING_CHAR (charstart, ptr - charstart + 1);
if (char_base != (ch & ~0x3F))
ch = -1;
}
- j = *ptr;
+ if (ch >= 0400)
+ j = (ch & 0x3F) | 0200;
+ else
+ j = *ptr;
+
if (i == infinity)
stride_for_teases = BM_tab[j];
if (ch >= 0)
{
int starting_ch = ch;
- int starting_j;
+ int starting_j = j;
- if (ch > 0400)
- starting_j = (ch & ~0x3F) | 0200;
- else
- starting_j = ch;
while (1)
{
TRANSLATE (ch, inverse_trt, ch);
- if (ch > 0400)
- j = (ch & ~0x3F) | 0200;
+ if (ch >= 0400)
+ j = (ch & 0x3F) | 0200;
else
j = ch;
else
some_multiletter_word = 1;
}
- else if (!NOCASEP (c))
+ else if (UPPERCASEP (c))
{
some_uppercase = 1;
if (SYNTAX (prevc) != Sword)
for (; in != end; in++)
{
- if (*in == '[' || *in == ']'
+ if (*in == '['
|| *in == '*' || *in == '.' || *in == '\\'
|| *in == '?' || *in == '+'
|| *in == '^' || *in == '$')