/* String search routines for GNU Emacs.
- Copyright (C) 1985, 86,87,93,94,97,98, 1999, 2004
- Free Software Foundation, Inc.
+ Copyright (C) 1985, 1986, 1987, 1993, 1994, 1997, 1998, 1999, 2002, 2003,
+ 2004, 2005 Free Software Foundation, Inc.
This file is part of GNU Emacs.
You should have received a copy of the GNU General Public License
along with GNU Emacs; see the file COPYING. If not, write to
-the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA. */
+the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA. */
#include <config.h>
DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
doc: /* Return index of start of first match for REGEXP in STRING, or nil.
-Case is ignored if `case-fold-search' is non-nil in the current buffer.
+Matching ignores case if `case-fold-search' is non-nil.
If third arg START is non-nil, start search at that index in STRING.
For index of first char beyond the match, do (match-end 0).
`match-end' and `match-beginning' also give indices of substrings
int raw_pattern_size_byte;
unsigned char *patbuf;
int multibyte = !NILP (current_buffer->enable_multibyte_characters);
- unsigned char *base_pat = SDATA (string);
- /* High bits of char; 0 for ASCII characters, (CHAR & ~0x3F)
- otherwise. Characters of the same high bits have the same
- sequence of bytes but last. To do the BM search, all
- characters in STRING must have the same high bits (including
- their case translations). */
- int char_high_bits = -1;
+ unsigned char *base_pat;
+ /* Set to positive if we find a non-ASCII char that need
+ translation. Otherwise set to zero later. */
+ int char_base = -1;
int boyer_moore_ok = 1;
/* MULTIBYTE says whether the text to be searched is multibyte.
base_pat = raw_pattern;
if (multibyte)
{
+ /* Fill patbuf by translated characters in STRING while
+ checking if we can use boyer-moore search. If TRT is
+ non-nil, we can use boyer-moore search only if TRT can be
+ represented by the byte array of 256 elements. For that,
+ all non-ASCII case-equivalents of all case-senstive
+ characters in STRING must belong to the same charset and
+ row. */
+
while (--len >= 0)
{
+ unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
int c, translated, inverse;
- int in_charlen;
+ int in_charlen, charlen;
/* If we got here and the RE flag is set, it's because we're
dealing with a regexp known to be trivial, so the backslash
c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
- /* Translate the character, if requested. */
- TRANSLATE (translated, trt, c);
- TRANSLATE (inverse, inverse_trt, c);
-
- /* Did this char actually get translated?
- Would any other char get translated into it? */
- if (translated != c || inverse != c)
+ if (NILP (trt))
{
- /* Keep track of which character set row
- contains the characters that need translation. */
- int this_high_bit = ASCII_CHAR_P (c) ? 0 : (c & ~0x3F);
- int c1 = inverse != c ? inverse : translated;
- int trt_high_bit = ASCII_CHAR_P (c1) ? 0 : (c1 & ~0x3F);
-
- if (this_high_bit != trt_high_bit)
- boyer_moore_ok = 0;
- else if (char_high_bits == -1)
- char_high_bits = this_high_bit;
- else if (char_high_bits != this_high_bit)
- /* If two different rows appear, needing translation,
- then we cannot use boyer_moore search. */
- boyer_moore_ok = 0;
+ str = base_pat;
+ charlen = in_charlen;
}
+ else
+ {
+ /* Translate the character. */
+ TRANSLATE (translated, trt, c);
+ charlen = CHAR_STRING (translated, str_base);
+ str = str_base;
+
+ /* Check if C has any other case-equivalents. */
+ TRANSLATE (inverse, inverse_trt, c);
+ /* If so, check if we can use boyer-moore. */
+ if (c != inverse && boyer_moore_ok)
+ {
+ /* Check if all equivalents belong to the same
+ group of characters. Note that the check of C
+ itself is done by the last iteration. */
+ int this_char_base = -1;
+
+ while (boyer_moore_ok)
+ {
+ if (ASCII_BYTE_P (inverse))
+ {
+ if (this_char_base > 0)
+ boyer_moore_ok = 0;
+ else
+ {
+ this_char_base = 0;
+ if (char_base < 0)
+ char_base = this_char_base;
+ }
+ }
+ else if (CHAR_BYTE8_P (inverse))
+ /* Boyer-moore search can't handle a
+ translation of an eight-bit
+ character. */
+ boyer_moore_ok = 0;
+ else if (this_char_base < 0)
+ {
+ this_char_base = inverse & ~0x3F;
+ if (char_base < 0)
+ char_base = this_char_base;
+ else if (char_base > 0
+ && this_char_base != char_base)
+ boyer_moore_ok = 0;
+ }
+ else if ((inverse & ~0x3F) != this_char_base)
+ boyer_moore_ok = 0;
+ if (c == inverse)
+ break;
+ TRANSLATE (inverse, inverse_trt, inverse);
+ }
+ }
+ }
+ if (char_base < 0)
+ char_base = 0;
/* Store this character into the translated pattern. */
- CHAR_STRING_ADVANCE (translated, pat);
+ bcopy (str, pat, charlen);
+ pat += charlen;
base_pat += in_charlen;
len_byte -= in_charlen;
}
else
{
/* Unibyte buffer. */
- char_high_bits = 0;
+ char_base = 0;
while (--len >= 0)
{
int c, translated;
if (boyer_moore_ok)
return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
pos, pos_byte, lim, lim_byte,
- char_high_bits);
+ char_base);
else
return simple_search (n, pat, len, len_byte, trt,
pos, pos_byte, lim, lim_byte);
{
int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
int forward = n > 0;
+ /* Number of buffer bytes matched. Note that this may be different
+ from len_byte in a multibyte buffer. */
+ int match_byte;
if (lim > pos && multibyte)
while (n > 0)
if (this_len == 0)
{
+ match_byte = this_pos_byte - pos_byte;
pos += len;
- pos_byte += len_byte;
+ pos_byte += match_byte;
break;
}
if (this_len == 0)
{
+ match_byte = len;
pos += len;
break;
}
if (pos - len < lim)
goto stop;
this_pos_byte = CHAR_TO_BYTE (this_pos);
+ match_byte = pos_byte - this_pos_byte;
while (this_len > 0)
{
if (this_len == 0)
{
pos -= len;
- pos_byte -= len_byte;
+ pos_byte -= match_byte;
break;
}
if (this_len == 0)
{
+ match_byte = len;
pos -= len;
break;
}
if (n == 0)
{
if (forward)
- set_search_regs ((multibyte ? pos_byte : pos) - len_byte, len_byte);
+ set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte);
else
- set_search_regs (multibyte ? pos_byte : pos, len_byte);
+ set_search_regs (multibyte ? pos_byte : pos, match_byte);
return pos;
}
have nontrivial translation are the same aside from the last byte.
This makes it possible to translate just the last byte of a
character, and do so after just a simple test of the context.
- CHARSET_BASE is nonzero iff there is such a non-ASCII character.
+ CHAR_BASE is nonzero iff there is such a non-ASCII character.
If that criterion is not satisfied, do not call this function. */
static int
boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
- pos, pos_byte, lim, lim_byte, char_high_bits)
+ pos, pos_byte, lim, lim_byte, char_base)
int n;
unsigned char *base_pat;
int len, len_byte;
Lisp_Object inverse_trt;
int pos, pos_byte;
int lim, lim_byte;
- int char_high_bits;
+ int char_base;
{
int direction = ((n > 0) ? 1 : -1);
register int dirlen;
unsigned char simple_translate[0400];
/* These are set to the preceding bytes of a byte to be translated
- if charset_base is nonzero. As the maximum byte length of a
- multibyte character is 4, we have to check at most three previous
+ if char_base is nonzero. As the maximum byte length of a
+ multibyte character is 5, we have to check at most four previous
bytes. */
int translate_prev_byte1 = 0;
int translate_prev_byte2 = 0;
int translate_prev_byte3 = 0;
+ int translate_prev_byte4 = 0;
#ifdef C_ALLOCA
int BM_tab_space[0400];
for (i = 0; i < 0400; i++)
simple_translate[i] = i;
- if (charset_base)
+ if (char_base)
{
- /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a
+ /* Setup translate_prev_byte1/2/3/4 from CHAR_BASE. Only a
byte following them are the target of translation. */
- int sample_char = charset_base | 0x20;
unsigned char str[MAX_MULTIBYTE_LENGTH];
- int len = CHAR_STRING (sample_char, str);
+ int len = CHAR_STRING (char_base, str);
translate_prev_byte1 = str[len - 2];
if (len > 2)
{
translate_prev_byte2 = str[len - 3];
if (len > 3)
- translate_prev_byte3 = str[len - 4];
+ {
+ translate_prev_byte3 = str[len - 4];
+ if (len > 4)
+ translate_prev_byte4 = str[len - 5];
+ }
}
}
i = infinity;
if (! NILP (trt))
{
- int ch;
- int untranslated;
- int this_translated = 1;
-
- if (multibyte
- /* Is *PTR the last byte of a character? */
- && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1])))
+ /* If the byte currently looking at is the last of a
+ character to check case-equivalents, set CH to that
+ character. An ASCII character and a non-ASCII character
+ matching with CHAR_BASE are to be checked. */
+ int ch = -1;
+
+ if (ASCII_BYTE_P (*ptr) || ! multibyte)
+ ch = *ptr;
+ else if (char_base
+ && ((pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1])))
{
- unsigned char *charstart = ptr;
- while (! CHAR_HEAD_P (*charstart))
+ unsigned char *charstart = ptr - 1;
+
+ while (! (CHAR_HEAD_P (*charstart)))
charstart--;
- untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
- if (char_high_bits
- == (ASCII_CHAR_P (untranslated) ? 0 : untranslated & ~0x3F))
- {
- TRANSLATE (ch, trt, untranslated);
- if (! CHAR_HEAD_P (*ptr))
- {
- translate_prev_byte = ptr[-1];
- if (! CHAR_HEAD_P (translate_prev_byte))
- translate_anteprev_byte = ptr[-2];
- }
- }
- else
- {
- this_translated = 0;
- ch = *ptr;
- }
- }
- else if (!multibyte)
- TRANSLATE (ch, trt, *ptr);
- else
- {
- ch = *ptr;
- this_translated = 0;
+ ch = STRING_CHAR (charstart, ptr - charstart + 1);
+ if (char_base != (ch & ~0x3F))
+ ch = -1;
}
- if (this_translated
- && ch >= 0200)
+ if (ch > 0400)
j = (ch & 0x3F) | 0200;
else
- j = (unsigned char) ch;
+ j = *ptr;
if (i == infinity)
stride_for_teases = BM_tab[j];
BM_tab[j] = dirlen - i;
/* A translation table is accompanied by its inverse -- see */
/* comment following downcase_table for details */
- if (this_translated)
+ if (ch >= 0)
{
int starting_ch = ch;
int starting_j = j;
+
while (1)
{
TRANSLATE (ch, inverse_trt, ch);
- if (ch > 0200)
+ if (ch > 0400)
j = (ch & 0x3F) | 0200;
else
- j = (unsigned char) ch;
+ j = ch;
/* For all the characters that map into CH,
set up simple_translate to map the last byte
/* but some C compilers blew it */
if (search_regs.num_regs <= 0)
- error ("replace-match called before any match found");
+ error ("`replace-match' called before any match found");
if (NILP (subexp))
sub = 0;
else
some_multiletter_word = 1;
}
- else if (!NOCASEP (c))
+ else if (UPPERCASEP (c))
{
some_uppercase = 1;
if (SYNTAX (prevc) != Sword)
return match_limit (subexp, 0);
}
-DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 2, 0,
+DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 3, 0,
doc: /* Return a list containing all info on what the last search matched.
Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'.
All the elements are markers or nil (nil if the Nth pair didn't match)
this case, and if the last match was in a buffer, the buffer will get
stored as one additional element at the end of the list.
-If REUSE is a list, reuse it as part of the value. If REUSE is long enough
-to hold all the values, and if INTEGERS is non-nil, no consing is done.
+If REUSE is a list, reuse it as part of the value. If REUSE is long
+enough to hold all the values, and if INTEGERS is non-nil, no consing
+is done.
+
+If optional third arg RESEAT is non-nil, any previous markers on the
+REUSE list will be modified to point to nowhere.
Return value is undefined if the last search failed. */)
- (integers, reuse)
- Lisp_Object integers, reuse;
+ (integers, reuse, reseat)
+ Lisp_Object integers, reuse, reseat;
{
Lisp_Object tail, prev;
Lisp_Object *data;
int i, len;
+ if (!NILP (reseat))
+ for (tail = reuse; CONSP (tail); tail = XCDR (tail))
+ if (MARKERP (XCAR (tail)))
+ {
+ unchain_marker (XMARKER (XCAR (tail)));
+ XSETCAR (tail, Qnil);
+ }
+
if (NILP (last_thing_searched))
return Qnil;
/* last_thing_searched must always be Qt, a buffer, or Qnil. */
abort ();
- len = 2*(i+1);
+ len = 2 * i + 2;
}
else
- data[2 * i] = data [2 * i + 1] = Qnil;
+ data[2 * i] = data[2 * i + 1] = Qnil;
}
if (BUFFERP (last_thing_searched) && !NILP (integers))
return reuse;
}
+/* Internal usage only:
+ If RESEAT is `evaporate', put the markers back on the free list
+ immediately. No other references to the markers must exist in this case,
+ so it is used only internally on the unwind stack and save-match-data from
+ Lisp. */
-DEFUN ("set-match-data", Fset_match_data, Sset_match_data, 1, 1, 0,
+DEFUN ("set-match-data", Fset_match_data, Sset_match_data, 1, 2, 0,
doc: /* Set internal data on last search match from elements of LIST.
-LIST should have been created by calling `match-data' previously. */)
- (list)
- register Lisp_Object list;
+LIST should have been created by calling `match-data' previously.
+
+If optional arg RESEAT is non-nil, make markers on LIST point nowhere. */)
+ (list, reseat)
+ register Lisp_Object list, reseat;
{
register int i;
register Lisp_Object marker;
search_regs.num_regs = length;
}
- for (i = 0;; i++)
+ for (i = 0; CONSP (list); i++)
{
- marker = Fcar (list);
+ marker = XCAR (list);
if (BUFFERP (marker))
{
last_thing_searched = marker;
if (NILP (marker))
{
search_regs.start[i] = -1;
- list = Fcdr (list);
+ list = XCDR (list);
}
else
{
int from;
+ Lisp_Object m;
+ m = marker;
if (MARKERP (marker))
{
if (XMARKER (marker)->buffer == 0)
CHECK_NUMBER_COERCE_MARKER (marker);
from = XINT (marker);
- list = Fcdr (list);
- marker = Fcar (list);
+ if (!NILP (reseat) && MARKERP (m))
+ {
+ if (EQ (reseat, Qevaporate))
+ free_marker (m);
+ else
+ unchain_marker (XMARKER (m));
+ XSETCAR (list, Qnil);
+ }
+
+ if ((list = XCDR (list), !CONSP (list)))
+ break;
+
+ m = marker = XCAR (list);
+
if (MARKERP (marker) && XMARKER (marker)->buffer == 0)
XSETFASTINT (marker, 0);
CHECK_NUMBER_COERCE_MARKER (marker);
search_regs.start[i] = from;
search_regs.end[i] = XINT (marker);
+
+ if (!NILP (reseat) && MARKERP (m))
+ {
+ if (EQ (reseat, Qevaporate))
+ free_marker (m);
+ else
+ unchain_marker (XMARKER (m));
+ XSETCAR (list, Qnil);
+ }
}
- list = Fcdr (list);
+ list = XCDR (list);
}
for (; i < search_regs.num_regs; i++)
/* Called upon exit from filters and sentinels. */
void
-restore_match_data ()
+restore_search_regs ()
{
if (search_regs_saved)
{
}
}
+static Lisp_Object
+unwind_set_match_data (list)
+ Lisp_Object list;
+{
+ /* It is safe to free (evaporate) the markers immediately. */
+ return Fset_match_data (list, Qevaporate);
+}
+
+/* Called to unwind protect the match data. */
+void
+record_unwind_save_match_data ()
+{
+ record_unwind_protect (unwind_set_match_data,
+ Fmatch_data (Qnil, Qnil, Qnil));
+}
+
/* Quote a string to inactivate reg-expr chars */
DEFUN ("regexp-quote", Fregexp_quote, Sregexp_quote, 1, 1, 0,
searchbufs[i].regexp = Qnil;
searchbufs[i].whitespace_regexp = Qnil;
staticpro (&searchbufs[i].regexp);
+ staticpro (&searchbufs[i].whitespace_regexp);
searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]);
}
searchbuf_head = &searchbufs[0];