X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/d02fe47dd3be7310d1bfd6e802d1fac2ea5f5e9d..95b1abcfafe8a366a75635f5fa4b4fa1e79f2964:/src/search.c diff --git a/src/search.c b/src/search.c index 96daecb728..2269afc6d8 100644 --- a/src/search.c +++ b/src/search.c @@ -1,14 +1,14 @@ /* String search routines for GNU Emacs. Copyright (C) 1985, 1986, 1987, 1993, 1994, 1997, 1998, 1999, 2001, 2002, - 2003, 2004, 2005, 2006, 2007, 2008 + 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. This file is part of GNU Emacs. -GNU Emacs is free software; you can redistribute it and/or modify +GNU Emacs is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. GNU Emacs is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -16,12 +16,11 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with GNU Emacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, -Boston, MA 02110-1301, USA. */ +along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */ #include <config.h> +#include <setjmp.h> #include "lisp.h" #include "syntax.h" #include "category.h" @@ -99,11 +98,18 @@ Lisp_Object Vsearch_spaces_regexp; only. 
*/ Lisp_Object Vinhibit_changing_match_data; -static void set_search_regs (); -static void save_search_regs (); -static int simple_search (); -static int boyer_moore (); -static int search_buffer (); +static void set_search_regs P_ ((EMACS_INT, EMACS_INT)); +static void save_search_regs P_ ((void)); +static EMACS_INT simple_search P_ ((int, unsigned char *, int, int, + Lisp_Object, EMACS_INT, EMACS_INT, + EMACS_INT, EMACS_INT)); +static EMACS_INT boyer_moore P_ ((int, unsigned char *, int, int, + Lisp_Object, Lisp_Object, + EMACS_INT, EMACS_INT, + EMACS_INT, EMACS_INT, int)); +static EMACS_INT search_buffer P_ ((Lisp_Object, EMACS_INT, EMACS_INT, + EMACS_INT, EMACS_INT, int, int, + Lisp_Object, Lisp_Object, int)); static void matcher_overflow () NO_RETURN; static void @@ -204,8 +210,8 @@ clear_regexp_cache () int i; for (i = 0; i < REGEXP_CACHE_SIZE; ++i) - /* It's tempting to compare with the syntax-table we've actually changd, - but it's not sufficient because char-table inheritance mewans that + /* It's tempting to compare with the syntax-table we've actually changed, + but it's not sufficient because char-table inheritance means that modifying one syntax-table can change others at the same time. */ if (!EQ (searchbufs[i].syntax_table, Qt)) searchbufs[i].regexp = Qnil; @@ -290,7 +296,7 @@ looking_at_1 (string, posix) { Lisp_Object val; unsigned char *p1, *p2; - int s1, s2; + EMACS_INT s1, s2; register int i; struct re_pattern_buffer *bufp; @@ -392,7 +398,7 @@ string_match_1 (regexp, string, start, posix) { int val; struct re_pattern_buffer *bufp; - int pos, pos_byte; + EMACS_INT pos, pos_byte; int i; if (running_asynch_code) @@ -558,6 +564,74 @@ fast_string_match_ignore_case (regexp, string) immediate_quit = 0; return val; } + +/* Match REGEXP against the characters after POS to LIMIT, and return + the number of matched characters. If STRING is non-nil, match + against the characters in it. In that case, POS and LIMIT are + indices into the string. 
This function doesn't modify the match + data. */ + +EMACS_INT +fast_looking_at (regexp, pos, pos_byte, limit, limit_byte, string) + Lisp_Object regexp; + EMACS_INT pos, pos_byte, limit, limit_byte; + Lisp_Object string; +{ + int multibyte; + struct re_pattern_buffer *buf; + unsigned char *p1, *p2; + EMACS_INT s1, s2; + EMACS_INT len; + + if (STRINGP (string)) + { + if (pos_byte < 0) + pos_byte = string_char_to_byte (string, pos); + if (limit_byte < 0) + limit_byte = string_char_to_byte (string, limit); + p1 = NULL; + s1 = 0; + p2 = SDATA (string); + s2 = SBYTES (string); + re_match_object = string; + multibyte = STRING_MULTIBYTE (string); + } + else + { + if (pos_byte < 0) + pos_byte = CHAR_TO_BYTE (pos); + if (limit_byte < 0) + limit_byte = CHAR_TO_BYTE (limit); + pos_byte -= BEGV_BYTE; + limit_byte -= BEGV_BYTE; + p1 = BEGV_ADDR; + s1 = GPT_BYTE - BEGV_BYTE; + p2 = GAP_END_ADDR; + s2 = ZV_BYTE - GPT_BYTE; + if (s1 < 0) + { + p2 = p1; + s2 = ZV_BYTE - BEGV_BYTE; + s1 = 0; + } + if (s2 < 0) + { + s1 = ZV_BYTE - BEGV_BYTE; + s2 = 0; + } + re_match_object = Qnil; + multibyte = ! NILP (current_buffer->enable_multibyte_characters); + } + + buf = compile_pattern (regexp, 0, Qnil, 0, multibyte); + immediate_quit = 1; + len = re_match_2 (buf, (char *) p1, s1, (char *) p2, s2, + pos_byte, NULL, limit_byte); + immediate_quit = 0; + + return len; +} + /* The newline cache: remembering which sections of text have no newlines. */ @@ -610,7 +684,7 @@ newline_cache_on_off (buf) int scan_buffer (target, start, end, count, shortage, allow_quit) register int target; - int start, end; + EMACS_INT start, end; int count; int *shortage; int allow_quit; @@ -645,9 +719,9 @@ scan_buffer (target, start, end, count, shortage, allow_quit) the position of the last character before the next such obstacle --- the last character the dumb search loop should examine. 
*/ - int ceiling_byte = CHAR_TO_BYTE (end) - 1; - int start_byte = CHAR_TO_BYTE (start); - int tem; + EMACS_INT ceiling_byte = CHAR_TO_BYTE (end) - 1; + EMACS_INT start_byte = CHAR_TO_BYTE (start); + EMACS_INT tem; /* If we're looking for a newline, consult the newline cache to see where we can avoid some scanning. */ @@ -718,9 +792,9 @@ scan_buffer (target, start, end, count, shortage, allow_quit) while (start > end) { /* The last character to check before the next obstacle. */ - int ceiling_byte = CHAR_TO_BYTE (end); - int start_byte = CHAR_TO_BYTE (start); - int tem; + EMACS_INT ceiling_byte = CHAR_TO_BYTE (end); + EMACS_INT start_byte = CHAR_TO_BYTE (start); + EMACS_INT tem; /* Consult the newline cache, if appropriate. */ if (target == '\n' && newline_cache) @@ -806,8 +880,8 @@ scan_buffer (target, start, end, count, shortage, allow_quit) int scan_newline (start, start_byte, limit, limit_byte, count, allow_quit) - int start, start_byte; - int limit, limit_byte; + EMACS_INT start, start_byte; + EMACS_INT limit, limit_byte; register int count; int allow_quit; { @@ -816,7 +890,7 @@ scan_newline (start, start_byte, limit, limit_byte, count, allow_quit) register unsigned char *cursor; unsigned char *base; - register int ceiling; + EMACS_INT ceiling; register unsigned char *ceiling_addr; int old_immediate_quit = immediate_quit; @@ -904,7 +978,8 @@ scan_newline (start, start_byte, limit, limit_byte, count, allow_quit) int find_next_newline_no_quit (from, cnt) - register int from, cnt; + EMACS_INT from; + int cnt; { return scan_buffer ('\n', from, 0, cnt, (int *) 0, 0); } @@ -915,7 +990,8 @@ find_next_newline_no_quit (from, cnt) int find_before_next_newline (from, to, cnt) - int from, to, cnt; + EMACS_INT from, to; + int cnt; { int shortage; int pos = scan_buffer ('\n', from, to, cnt, &shortage, 1); @@ -1076,14 +1152,14 @@ while (0) (i.e. Vinhibit_changing_match_data is non-nil). 
*/ static struct re_registers search_regs_1; -static int +static EMACS_INT search_buffer (string, pos, pos_byte, lim, lim_byte, n, RE, trt, inverse_trt, posix) Lisp_Object string; - int pos; - int pos_byte; - int lim; - int lim_byte; + EMACS_INT pos; + EMACS_INT pos_byte; + EMACS_INT lim; + EMACS_INT lim_byte; int n; int RE; Lisp_Object trt; @@ -1311,7 +1387,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, base_pat++; } - c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); + c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen); if (NILP (trt)) { @@ -1342,11 +1418,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, if (this_char_base > 0) boyer_moore_ok = 0; else - { - this_char_base = 0; - if (char_base < 0) - char_base = this_char_base; - } + this_char_base = 0; } else if (CHAR_BYTE8_P (inverse)) /* Boyer-moore search can't handle a @@ -1358,8 +1430,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, this_char_base = inverse & ~0x3F; if (char_base < 0) char_base = this_char_base; - else if (char_base > 0 - && this_char_base != char_base) + else if (this_char_base != char_base) boyer_moore_ok = 0; } else if ((inverse & ~0x3F) != this_char_base) @@ -1370,8 +1441,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, } } } - if (char_base < 0) - char_base = 0; /* Store this character into the translated pattern. */ bcopy (str, pat, charlen); @@ -1379,6 +1448,11 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, base_pat += in_charlen; len_byte -= in_charlen; } + + /* If char_base is still negative we didn't find any translated + non-ASCII characters. */ + if (char_base < 0) + char_base = 0; } else { @@ -1429,14 +1503,14 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, regardless of what is in TRT. It is used in cases where boyer_moore cannot work. 
*/ -static int +static EMACS_INT simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) int n; unsigned char *pat; int len, len_byte; Lisp_Object trt; - int pos, pos_byte; - int lim, lim_byte; + EMACS_INT pos, pos_byte; + EMACS_INT lim, lim_byte; { int multibyte = ! NILP (current_buffer->enable_multibyte_characters); int forward = n > 0; @@ -1450,10 +1524,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) while (1) { /* Try matching at position POS. */ - int this_pos = pos; - int this_pos_byte = pos_byte; + EMACS_INT this_pos = pos; + EMACS_INT this_pos_byte = pos_byte; int this_len = len; - int this_len_byte = len_byte; unsigned char *p = pat; if (pos + len > lim || pos_byte + len_byte > lim_byte) goto stop; @@ -1463,16 +1536,14 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) int charlen, buf_charlen; int pat_ch, buf_ch; - pat_ch = STRING_CHAR_AND_LENGTH (p, this_len_byte, charlen); + pat_ch = STRING_CHAR_AND_LENGTH (p, charlen); buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), - ZV_BYTE - this_pos_byte, buf_charlen); TRANSLATE (buf_ch, trt, buf_ch); if (buf_ch != pat_ch) break; - this_len_byte -= charlen; this_len--; p += charlen; @@ -1499,7 +1570,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) while (1) { /* Try matching at position POS. */ - int this_pos = pos; + EMACS_INT this_pos = pos; int this_len = len; unsigned char *p = pat; @@ -1538,42 +1609,36 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) while (1) { /* Try matching at position POS. 
*/ - int this_pos = pos - len; - int this_pos_byte; + EMACS_INT this_pos = pos; + EMACS_INT this_pos_byte = pos_byte; int this_len = len; - int this_len_byte = len_byte; - unsigned char *p = pat; + const unsigned char *p = pat + len_byte; - if (this_pos < lim || (pos_byte - len_byte) < lim_byte) + if (this_pos - len < lim || (pos_byte - len_byte) < lim_byte) goto stop; - this_pos_byte = CHAR_TO_BYTE (this_pos); - match_byte = pos_byte - this_pos_byte; while (this_len > 0) { - int charlen, buf_charlen; + int charlen; int pat_ch, buf_ch; - pat_ch = STRING_CHAR_AND_LENGTH (p, this_len_byte, charlen); - buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte), - ZV_BYTE - this_pos_byte, - buf_charlen); + DEC_BOTH (this_pos, this_pos_byte); + PREV_CHAR_BOUNDARY (p, pat); + pat_ch = STRING_CHAR (p); + buf_ch = STRING_CHAR (BYTE_POS_ADDR (this_pos_byte)); TRANSLATE (buf_ch, trt, buf_ch); if (buf_ch != pat_ch) break; - this_len_byte -= charlen; this_len--; - p += charlen; - this_pos_byte += buf_charlen; - this_pos++; } if (this_len == 0) { - pos -= len; - pos_byte -= match_byte; + match_byte = pos_byte - this_pos_byte; + pos = this_pos; + pos_byte = this_pos_byte; break; } @@ -1588,7 +1653,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) while (1) { /* Try matching at position POS. */ - int this_pos = pos - len; + EMACS_INT this_pos = pos - len; int this_len = len; unsigned char *p = pat; @@ -1651,7 +1716,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) If that criterion is not satisfied, do not call this function. 
*/ -static int +static EMACS_INT boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, pos, pos_byte, lim, lim_byte, char_base) int n; @@ -1659,15 +1724,15 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, int len, len_byte; Lisp_Object trt; Lisp_Object inverse_trt; - int pos, pos_byte; - int lim, lim_byte; + EMACS_INT pos, pos_byte; + EMACS_INT lim, lim_byte; int char_base; { int direction = ((n > 0) ? 1 : -1); register int dirlen; - int infinity, limit, stride_for_teases = 0; - register int *BM_tab; - int *BM_tab_base; + EMACS_INT limit; + int stride_for_teases = 0; + int BM_tab[0400]; register unsigned char *cursor, *p_limit; register int i, j; unsigned char *pat, *pat_end; @@ -1683,37 +1748,28 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, int translate_prev_byte3 = 0; int translate_prev_byte4 = 0; - BM_tab = (int *) alloca (0400 * sizeof (int)); - - /* The general approach is that we are going to maintain that we know */ - /* the first (closest to the present position, in whatever direction */ - /* we're searching) character that could possibly be the last */ - /* (furthest from present position) character of a valid match. We */ - /* advance the state of our knowledge by looking at that character */ - /* and seeing whether it indeed matches the last character of the */ - /* pattern. If it does, we take a closer look. If it does not, we */ - /* move our pointer (to putative last characters) as far as is */ - /* logically possible. This amount of movement, which I call a */ - /* stride, will be the length of the pattern if the actual character */ - /* appears nowhere in the pattern, otherwise it will be the distance */ - /* from the last occurrence of that character to the end of the */ - /* pattern. */ - /* As a coding trick, an enormous stride is coded into the table for */ - /* characters that match the last character. 
This allows use of only */ - /* a single test, a test for having gone past the end of the */ - /* permissible match region, to test for both possible matches (when */ - /* the stride goes past the end immediately) and failure to */ - /* match (where you get nudged past the end one stride at a time). */ - - /* Here we make a "mickey mouse" BM table. The stride of the search */ - /* is determined only by the last character of the putative match. */ - /* If that character does not match, we will stride the proper */ - /* distance to propose a match that superimposes it on the last */ - /* instance of a character that matches it (per trt), or misses */ - /* it entirely if there is none. */ + /* The general approach is that we are going to maintain that we know + the first (closest to the present position, in whatever direction + we're searching) character that could possibly be the last + (furthest from present position) character of a valid match. We + advance the state of our knowledge by looking at that character + and seeing whether it indeed matches the last character of the + pattern. If it does, we take a closer look. If it does not, we + move our pointer (to putative last characters) as far as is + logically possible. This amount of movement, which I call a + stride, will be the length of the pattern if the actual character + appears nowhere in the pattern, otherwise it will be the distance + from the last occurrence of that character to the end of the + pattern. If the amount is zero we have a possible match. */ + + /* Here we make a "mickey mouse" BM table. The stride of the search + is determined only by the last character of the putative match. + If that character does not match, we will stride the proper + distance to propose a match that superimposes it on the last + instance of a character that matches it (per trt), or misses + it entirely if there is none. 
*/ dirlen = len_byte * direction; - infinity = dirlen - (lim_byte + pos_byte + len_byte + len_byte) * direction; /* Record position after the end of the pattern. */ pat_end = base_pat + len_byte; @@ -1723,23 +1779,14 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, if (direction < 0) base_pat = pat_end - 1; - BM_tab_base = BM_tab; - BM_tab += 0400; - j = dirlen; /* to get it in a register */ - /* A character that does not appear in the pattern induces a */ - /* stride equal to the pattern length. */ - while (BM_tab_base != BM_tab) - { - *--BM_tab = j; - *--BM_tab = j; - *--BM_tab = j; - *--BM_tab = j; - } + /* A character that does not appear in the pattern induces a + stride equal to the pattern length. */ + for (i = 0; i < 0400; i++) + BM_tab[i] = dirlen; /* We use this for translation, instead of TRT itself. We fill this in to handle the characters that actually occur in the pattern. Others don't matter anyway! */ - bzero (simple_translate, sizeof simple_translate); for (i = 0; i < 0400; i++) simple_translate[i] = i; @@ -1764,12 +1811,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, } i = 0; - while (i != infinity) + while (i != dirlen) { unsigned char *ptr = base_pat + i; i += direction; - if (i == dirlen) - i = infinity; if (! NILP (trt)) { /* If the byte currently looking at is the last of a @@ -1787,7 +1832,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, while (! 
(CHAR_HEAD_P (*charstart))) charstart--; - ch = STRING_CHAR (charstart, ptr - charstart + 1); + ch = STRING_CHAR (charstart); if (char_base != (ch & ~0x3F)) ch = -1; } @@ -1797,7 +1842,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, else j = *ptr; - if (i == infinity) + if (i == dirlen) stride_for_teases = BM_tab[j]; BM_tab[j] = dirlen - i; @@ -1830,23 +1875,22 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, { j = *ptr; - if (i == infinity) + if (i == dirlen) stride_for_teases = BM_tab[j]; BM_tab[j] = dirlen - i; } - /* stride_for_teases tells how much to stride if we get a */ - /* match on the far character but are subsequently */ - /* disappointed, by recording what the stride would have been */ - /* for that character if the last character had been */ - /* different. */ + /* stride_for_teases tells how much to stride if we get a + match on the far character but are subsequently + disappointed, by recording what the stride would have been + for that character if the last character had been + different. */ } - infinity = dirlen - infinity; pos_byte += dirlen - ((direction > 0) ? direction : 0); /* loop invariant - POS_BYTE points at where last char (first char if reverse) of pattern would align in a possible match. */ while (n != 0) { - int tail_end; + EMACS_INT tail_end; unsigned char *tail_end_ptr; /* It's been reported that some (broken) compiler thinks that @@ -1884,43 +1928,34 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, p_limit = BYTE_POS_ADDR (limit); p2 = (cursor = BYTE_POS_ADDR (pos_byte)); - /* In this loop, pos + cursor - p2 is the surrogate for pos */ + /* In this loop, pos + cursor - p2 is the surrogate for pos. */ while (1) /* use one cursor setting as long as i can */ { if (direction > 0) /* worth duplicating */ { - /* Use signed comparison if appropriate - to make cursor+infinity sure to be > p_limit. 
- Assuming that the buffer lies in a range of addresses - that are all "positive" (as ints) or all "negative", - either kind of comparison will work as long - as we don't step by infinity. So pick the kind - that works when we do step by infinity. */ - if ((EMACS_INT) (p_limit + infinity) > (EMACS_INT) p_limit) - while ((EMACS_INT) cursor <= (EMACS_INT) p_limit) - cursor += BM_tab[*cursor]; - else - while ((EMACS_UINT) cursor <= (EMACS_UINT) p_limit) + while (cursor <= p_limit) + { + if (BM_tab[*cursor] == 0) + goto hit; cursor += BM_tab[*cursor]; + } } else { - if ((EMACS_INT) (p_limit + infinity) < (EMACS_INT) p_limit) - while ((EMACS_INT) cursor >= (EMACS_INT) p_limit) - cursor += BM_tab[*cursor]; - else - while ((EMACS_UINT) cursor >= (EMACS_UINT) p_limit) + while (cursor >= p_limit) + { + if (BM_tab[*cursor] == 0) + goto hit; cursor += BM_tab[*cursor]; + } } -/* If you are here, cursor is beyond the end of the searched region. */ -/* This can happen if you match on the far character of the pattern, */ -/* because the "stride" of that character is infinity, a number able */ -/* to throw you well beyond the end of the search. It can also */ -/* happen if you fail to match within the permitted region and would */ -/* otherwise try a character beyond that region */ - if ((cursor - p_limit) * direction <= len_byte) - break; /* a small overrun is genuine */ - cursor -= infinity; /* large overrun = hit */ + /* If you are here, cursor is beyond the end of the + searched region. You fail to match within the + permitted region and would otherwise try a character + beyond that region. */ + break; + + hit: i = dirlen - direction; if (! 
NILP (trt)) { @@ -1959,7 +1994,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, cursor += dirlen - i - direction; /* fix cursor */ if (i + direction == 0) { - int position, start, end; + EMACS_INT position, start, end; cursor -= direction; @@ -1992,8 +2027,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, pos_byte += cursor - p2; } else - /* Now we'll pick up a clump that has to be done the hard */ - /* way because it covers a discontinuity */ + /* Now we'll pick up a clump that has to be done the hard + way because it covers a discontinuity. */ { limit = ((direction > 0) ? BUFFER_CEILING_OF (pos_byte - dirlen + 1) @@ -2005,19 +2040,21 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, and still be valid for a possible match. */ while (1) { - /* This loop can be coded for space rather than */ - /* speed because it will usually run only once. */ - /* (the reach is at most len + 21, and typically */ - /* does not exceed len) */ + /* This loop can be coded for space rather than + speed because it will usually run only once. + (the reach is at most len + 21, and typically + does not exceed len). */ while ((limit - pos_byte) * direction >= 0) - pos_byte += BM_tab[FETCH_BYTE (pos_byte)]; - /* now run the same tests to distinguish going off the */ - /* end, a match or a phony match. */ - if ((pos_byte - limit) * direction <= len_byte) - break; /* ran off the end */ - /* Found what might be a match. - Set POS_BYTE back to last (first if reverse) pos. */ - pos_byte -= infinity; + { + int ch = FETCH_BYTE (pos_byte); + if (BM_tab[ch] == 0) + goto hit2; + pos_byte += BM_tab[ch]; + } + break; /* ran off the end */ + + hit2: + /* Found what might be a match. */ i = dirlen - direction; while ((i -= direction) + direction != 0) { @@ -2046,10 +2083,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, /* Above loop has moved POS_BYTE part or all the way back to the first pos (last pos if reverse). 
Set it once again at the last (first if reverse) char. */ - pos_byte += dirlen - i- direction; + pos_byte += dirlen - i - direction; if (i + direction == 0) { - int position, start, end; + EMACS_INT position, start, end; pos_byte -= direction; position = pos_byte + ((direction > 0) ? 1 - len_byte : 0); @@ -2091,7 +2128,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, static void set_search_regs (beg_byte, nbytes) - int beg_byte, nbytes; + EMACS_INT beg_byte, nbytes; { int i; @@ -2119,19 +2156,21 @@ set_search_regs (beg_byte, nbytes) XSETBUFFER (last_thing_searched, current_buffer); } -/* Given a string of words separated by word delimiters, - compute a regexp that matches those exact words - separated by arbitrary punctuation. */ +/* Given STRING, a string of words separated by word delimiters, + compute a regexp that matches those exact words separated by + arbitrary punctuation. If LAX is nonzero, the end of the string + need not match a word boundary unless it ends in whitespace. */ static Lisp_Object -wordify (string) +wordify (string, lax) Lisp_Object string; + int lax; { register unsigned char *p, *o; register int i, i_byte, len, punct_count = 0, word_count = 0; Lisp_Object val; int prev_c = 0; - int adjust; + int adjust, whitespace_at_end; CHECK_STRING (string); p = SDATA (string); @@ -2154,11 +2193,18 @@ wordify (string) } if (SYNTAX (prev_c) == Sword) - word_count++; + { + word_count++; + whitespace_at_end = 0; + } + else + whitespace_at_end = 1; + if (!word_count) return empty_unibyte_string; - adjust = - punct_count + 5 * (word_count - 1) + 4; + adjust = - punct_count + 5 * (word_count - 1) + + ((lax && !whitespace_at_end) ? 
2 : 4); if (STRING_MULTIBYTE (string)) val = make_uninit_multibyte_string (len + adjust, SBYTES (string) @@ -2196,8 +2242,11 @@ wordify (string) prev_c = c; } - *o++ = '\\'; - *o++ = 'b'; + if (!lax || whitespace_at_end) + { + *o++ = '\\'; + *o++ = 'b'; + } return val; } @@ -2254,7 +2303,7 @@ Optional fourth argument is repeat count--search for successive occurrences. */ (string, bound, noerror, count) Lisp_Object string, bound, noerror, count; { - return search_command (wordify (string), bound, noerror, count, -1, 1, 0); + return search_command (wordify (string, 0), bound, noerror, count, -1, 1, 0); } DEFUN ("word-search-forward", Fword_search_forward, Sword_search_forward, 1, 4, @@ -2269,7 +2318,45 @@ Optional fourth argument is repeat count--search for successive occurrences. */ (string, bound, noerror, count) Lisp_Object string, bound, noerror, count; { - return search_command (wordify (string), bound, noerror, count, 1, 1, 0); + return search_command (wordify (string, 0), bound, noerror, count, 1, 1, 0); +} + +DEFUN ("word-search-backward-lax", Fword_search_backward_lax, Sword_search_backward_lax, 1, 4, + "sWord search backward: ", + doc: /* Search backward from point for STRING, ignoring differences in punctuation. +Set point to the beginning of the occurrence found, and return point. + +Unlike `word-search-backward', the end of STRING need not match a word +boundary unless it ends in whitespace. + +An optional second argument bounds the search; it is a buffer position. +The match found must not extend before that position. +Optional third argument, if t, means if fail just return nil (no error). + If not nil and not t, move to limit of search and return nil. +Optional fourth argument is repeat count--search for successive occurrences. 
*/) + (string, bound, noerror, count) + Lisp_Object string, bound, noerror, count; +{ + return search_command (wordify (string, 1), bound, noerror, count, -1, 1, 0); +} + +DEFUN ("word-search-forward-lax", Fword_search_forward_lax, Sword_search_forward_lax, 1, 4, + "sWord search: ", + doc: /* Search forward from point for STRING, ignoring differences in punctuation. +Set point to the end of the occurrence found, and return point. + +Unlike `word-search-forward', the end of STRING need not match a word +boundary unless it ends in whitespace. + +An optional second argument bounds the search; it is a buffer position. +The match found must not extend after that position. +Optional third argument, if t, means if fail just return nil (no error). + If not nil and not t, move to limit of search and return nil. +Optional fourth argument is repeat count--search for successive occurrences. */) + (string, bound, noerror, count) + Lisp_Object string, bound, noerror, count; +{ + return search_command (wordify (string, 1), bound, noerror, count, 1, 1, 0); } DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 4, @@ -2393,7 +2480,7 @@ since only regular expressions have distinguished subexpressions. */) int some_nonuppercase_initial; register int c, prevc; int sub; - int opoint, newpoint; + EMACS_INT opoint, newpoint; CHECK_STRING (newtext); @@ -2436,7 +2523,7 @@ since only regular expressions have distinguished subexpressions. */) if (NILP (fixedcase)) { /* Decide how to casify by examining the matched text. */ - int last; + EMACS_INT last; pos = search_regs.start[sub]; last = search_regs.end[sub]; @@ -2523,8 +2610,8 @@ since only regular expressions have distinguished subexpressions. */) if desired. */ if (NILP (literal)) { - int lastpos = 0; - int lastpos_byte = 0; + EMACS_INT lastpos = 0; + EMACS_INT lastpos_byte = 0; /* We build up the substituted string in ACCUM. 
*/ Lisp_Object accum; Lisp_Object middle; @@ -2664,7 +2751,7 @@ since only regular expressions have distinguished subexpressions. */) /* Note that we don't have to increment POS. */ c = SREF (newtext, pos_byte++); if (buf_multibyte) - c = unibyte_char_to_multibyte (c); + MAKE_CHAR_MULTIBYTE (c); } /* Either set ADD_STUFF and ADD_LEN to the text to put in SUBSTED, @@ -2686,7 +2773,7 @@ since only regular expressions have distinguished subexpressions. */) { c = SREF (newtext, pos_byte++); if (buf_multibyte) - c = unibyte_char_to_multibyte (c); + MAKE_CHAR_MULTIBYTE (c); } if (c == '&') @@ -2714,7 +2801,7 @@ since only regular expressions have distinguished subexpressions. */) set up ADD_STUFF and ADD_LEN to point to it. */ if (idx >= 0) { - int begbyte = CHAR_TO_BYTE (search_regs.start[idx]); + EMACS_INT begbyte = CHAR_TO_BYTE (search_regs.start[idx]); add_len = CHAR_TO_BYTE (search_regs.end[idx]) - begbyte; if (search_regs.start[idx] < GPT && GPT < search_regs.end[idx]) move_gap (search_regs.start[idx]); @@ -2768,9 +2855,9 @@ since only regular expressions have distinguished subexpressions. */) /* Adjust search data for this change. */ { - int oldend = search_regs.end[sub]; - int oldstart = search_regs.start[sub]; - int change = newpoint - search_regs.end[sub]; + EMACS_INT oldend = search_regs.end[sub]; + EMACS_INT oldstart = search_regs.start[sub]; + EMACS_INT change = newpoint - search_regs.end[sub]; int i; for (i = 0; i < search_regs.num_regs; i++) @@ -2849,7 +2936,7 @@ DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 3, 0, Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'. All the elements are markers or nil (nil if the Nth pair didn't match) if the last match was on a buffer; integers or nil if a string was matched. -Use `store-match-data' to reinstate the data in this list. +Use `set-match-data' to reinstate the data in this list. 
If INTEGERS (the optional first argument) is non-nil, always use integers \(rather than markers) to represent buffer positions. In @@ -3026,7 +3113,7 @@ If optional arg RESEAT is non-nil, make markers on LIST point nowhere. */) } else { - int from; + EMACS_INT from; Lisp_Object m; m = marker; @@ -3194,20 +3281,20 @@ syms_of_search () } searchbuf_head = &searchbufs[0]; - Qsearch_failed = intern ("search-failed"); + Qsearch_failed = intern_c_string ("search-failed"); staticpro (&Qsearch_failed); - Qinvalid_regexp = intern ("invalid-regexp"); + Qinvalid_regexp = intern_c_string ("invalid-regexp"); staticpro (&Qinvalid_regexp); Fput (Qsearch_failed, Qerror_conditions, - Fcons (Qsearch_failed, Fcons (Qerror, Qnil))); + pure_cons (Qsearch_failed, pure_cons (Qerror, Qnil))); Fput (Qsearch_failed, Qerror_message, - build_string ("Search failed")); + make_pure_c_string ("Search failed")); Fput (Qinvalid_regexp, Qerror_conditions, - Fcons (Qinvalid_regexp, Fcons (Qerror, Qnil))); + pure_cons (Qinvalid_regexp, pure_cons (Qerror, Qnil))); Fput (Qinvalid_regexp, Qerror_message, - build_string ("Invalid regexp")); + make_pure_c_string ("Invalid regexp")); last_thing_searched = Qnil; staticpro (&last_thing_searched); @@ -3239,6 +3326,8 @@ is to bind it with `let' around a small expression. */); defsubr (&Ssearch_backward); defsubr (&Sword_search_forward); defsubr (&Sword_search_backward); + defsubr (&Sword_search_forward_lax); + defsubr (&Sword_search_backward_lax); defsubr (&Sre_search_forward); defsubr (&Sre_search_backward); defsubr (&Sposix_search_forward);