X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/39d0bf746693ddc92c3102f6602dfe8a9cc6db9d..972ed2462a24213f68acda61d43e60f3ad6502b9:/src/search.c diff --git a/src/search.c b/src/search.c index 3c91d3cce9..244220b92f 100644 --- a/src/search.c +++ b/src/search.c @@ -1,13 +1,14 @@ /* String search routines for GNU Emacs. Copyright (C) 1985, 1986, 1987, 1993, 1994, 1997, 1998, 1999, 2001, 2002, - 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. + 2003, 2004, 2005, 2006, 2007, 2008 + Free Software Foundation, Inc. This file is part of GNU Emacs. -GNU Emacs is free software; you can redistribute it and/or modify +GNU Emacs is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 3, or (at your option) -any later version. +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. GNU Emacs is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -15,9 +16,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with GNU Emacs; see the file COPYING. If not, write to -the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, -Boston, MA 02110-1301, USA. */ +along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */ #include <config.h> @@ -25,6 +24,7 @@ Boston, MA 02110-1301, USA. */ #include "syntax.h" #include "category.h" #include "buffer.h" +#include "character.h" #include "charset.h" #include "region-cache.h" #include "commands.h" @@ -120,62 +120,30 @@ matcher_overflow () subexpression bounds. POSIX is nonzero if we want full backtracking (POSIX style) for this pattern. 0 means backtrack only enough to get a valid match. - MULTIBYTE is nonzero if we want to handle multibyte characters in - PATTERN. 
0 means all multibyte characters are recognized just as - sequences of binary data. The behavior also depends on Vsearch_spaces_regexp. */ static void -compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) +compile_pattern_1 (cp, pattern, translate, regp, posix) struct regexp_cache *cp; Lisp_Object pattern; Lisp_Object translate; struct re_registers *regp; int posix; - int multibyte; { - unsigned char *raw_pattern; - int raw_pattern_size; char *val; reg_syntax_t old; - /* MULTIBYTE says whether the text to be searched is multibyte. - We must convert PATTERN to match that, or we will not really - find things right. */ - - if (multibyte == STRING_MULTIBYTE (pattern)) - { - raw_pattern = (unsigned char *) SDATA (pattern); - raw_pattern_size = SBYTES (pattern); - } - else if (multibyte) - { - raw_pattern_size = count_size_as_multibyte (SDATA (pattern), - SCHARS (pattern)); - raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1); - copy_text (SDATA (pattern), raw_pattern, - SCHARS (pattern), 0, 1); - } - else - { - /* Converting multibyte to single-byte. - - ??? Perhaps this conversion should be done in a special way - by subtracting nonascii-insert-offset from each non-ASCII char, - so that only the multibyte chars which really correspond to - the chosen single-byte character set can possibly match. */ - raw_pattern_size = SCHARS (pattern); - raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1); - copy_text (SDATA (pattern), raw_pattern, - SBYTES (pattern), 1, 0); - } - cp->regexp = Qnil; cp->buf.translate = (! NILP (translate) ? 
translate : make_number (0)); cp->posix = posix; - cp->buf.multibyte = multibyte; - cp->whitespace_regexp = Vsearch_spaces_regexp; + cp->buf.multibyte = STRING_MULTIBYTE (pattern); + cp->buf.charset_unibyte = charset_unibyte; + if (STRINGP (Vsearch_spaces_regexp)) + cp->whitespace_regexp = Vsearch_spaces_regexp; + else + cp->whitespace_regexp = Qnil; + /* rms: I think BLOCK_INPUT is not needed here any more, because regex.c defines malloc to call xmalloc. Using BLOCK_INPUT here means the debugger won't run if an error occurs. @@ -184,11 +152,13 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) old = re_set_syntax (RE_SYNTAX_EMACS | (posix ? 0 : RE_NO_POSIX_BACKTRACKING)); - re_set_whitespace_regexp (NILP (Vsearch_spaces_regexp) ? NULL - : SDATA (Vsearch_spaces_regexp)); + if (STRINGP (Vsearch_spaces_regexp)) + re_set_whitespace_regexp (SDATA (Vsearch_spaces_regexp)); + else + re_set_whitespace_regexp (NULL); - val = (char *) re_compile_pattern ((char *)raw_pattern, - raw_pattern_size, &cp->buf); + val = (char *) re_compile_pattern ((char *) SDATA (pattern), + SBYTES (pattern), &cp->buf); /* If the compiled pattern hard codes some of the contents of the syntax-table, it can only be reused with *this* syntax table. */ @@ -274,10 +244,10 @@ compile_pattern (pattern, regp, translate, posix, multibyte) && !NILP (Fstring_equal (cp->regexp, pattern)) && EQ (cp->buf.translate, (! NILP (translate) ? 
translate : make_number (0))) && cp->posix == posix - && cp->buf.multibyte == multibyte && (EQ (cp->syntax_table, Qt) || EQ (cp->syntax_table, current_buffer->syntax_table)) - && !NILP (Fequal (cp->whitespace_regexp, Vsearch_spaces_regexp))) + && !NILP (Fequal (cp->whitespace_regexp, Vsearch_spaces_regexp)) + && cp->buf.charset_unibyte == charset_unibyte) break; /* If we're at the end of the cache, compile into the nil cell @@ -286,7 +256,7 @@ compile_pattern (pattern, regp, translate, posix, multibyte) if (cp->next == 0) { compile_it: - compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte); + compile_pattern_1 (cp, pattern, translate, regp, posix); break; } } @@ -303,6 +273,10 @@ compile_pattern (pattern, regp, translate, posix, multibyte) if (regp) re_set_registers (&cp->buf, regp, regp->num_regs, regp->start, regp->end); + /* The compiled pattern can be used both for multibyte and unibyte + target. But, we have to tell which the pattern is used for. */ + cp->buf.target_multibyte = multibyte; + return &cp->buf; } @@ -1264,7 +1238,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, unsigned char *base_pat; /* Set to positive if we find a non-ASCII char that need translation. Otherwise set to zero later. */ - int charset_base = -1; + int char_base = -1; int boyer_moore_ok = 1; /* MULTIBYTE says whether the text to be searched is multibyte. @@ -1305,7 +1279,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, /* Copy and optionally translate the pattern. */ len = raw_pattern_size; len_byte = raw_pattern_size_byte; - patbuf = (unsigned char *) alloca (len_byte); + patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH); pat = patbuf; base_pat = raw_pattern; if (multibyte) @@ -1355,46 +1329,40 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, if (c != inverse && boyer_moore_ok) { /* Check if all equivalents belong to the same - charset & row. Note that the check of C - itself is done by the last iteration. 
Note - also that we don't have to check ASCII - characters because boyer-moore search can - always handle their translation. */ - while (1) + group of characters. Note that the check of C + itself is done by the last iteration. */ + int this_char_base = -1; + + while (boyer_moore_ok) { if (ASCII_BYTE_P (inverse)) { - if (charset_base > 0) - { - boyer_moore_ok = 0; - break; - } - charset_base = 0; + if (this_char_base > 0) + boyer_moore_ok = 0; + else + this_char_base = 0; } - else if (SINGLE_BYTE_CHAR_P (inverse)) + else if (CHAR_BYTE8_P (inverse)) + /* Boyer-moore search can't handle a + translation of an eight-bit + character. */ + boyer_moore_ok = 0; + else if (this_char_base < 0) { - /* Boyer-moore search can't handle a - translation of an eight-bit - character. */ - boyer_moore_ok = 0; - break; - } - else if (charset_base < 0) - charset_base = inverse & ~CHAR_FIELD3_MASK; - else if ((inverse & ~CHAR_FIELD3_MASK) - != charset_base) - { - boyer_moore_ok = 0; - break; + this_char_base = inverse & ~0x3F; + if (char_base < 0) + char_base = this_char_base; + else if (this_char_base != char_base) + boyer_moore_ok = 0; } + else if ((inverse & ~0x3F) != this_char_base) + boyer_moore_ok = 0; if (c == inverse) break; TRANSLATE (inverse, inverse_trt, inverse); } } } - if (charset_base < 0) - charset_base = 0; /* Store this character into the translated pattern. */ bcopy (str, pat, charlen); @@ -1402,11 +1370,16 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, base_pat += in_charlen; len_byte -= in_charlen; } + + /* If char_base is still negative we didn't find any translated + non-ASCII characters. */ + if (char_base < 0) + char_base = 0; } else { /* Unibyte buffer. 
*/ - charset_base = 0; + char_base = 0; while (--len >= 0) { int c, translated; @@ -1433,7 +1406,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, if (boyer_moore_ok) return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, pos, pos_byte, lim, lim_byte, - charset_base); + char_base); else return simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte); @@ -1463,6 +1436,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) { int multibyte = ! NILP (current_buffer->enable_multibyte_characters); int forward = n > 0; + /* Number of buffer bytes matched. Note that this may be different + from len_byte in a multibyte buffer. */ + int match_byte; if (lim > pos && multibyte) while (n > 0) @@ -1475,7 +1451,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) int this_len = len; int this_len_byte = len_byte; unsigned char *p = pat; - if (pos + len > lim) + if (pos + len > lim || pos_byte + len_byte > lim_byte) goto stop; while (this_len > 0) @@ -1502,8 +1478,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) if (this_len == 0) { + match_byte = this_pos_byte - pos_byte; pos += len; - pos_byte += len_byte; + pos_byte += match_byte; break; } @@ -1540,6 +1517,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) if (this_len == 0) { + match_byte = len; pos += len; break; } @@ -1557,13 +1535,15 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) { /* Try matching at position POS. 
*/ int this_pos = pos - len; - int this_pos_byte = pos_byte - len_byte; + int this_pos_byte; int this_len = len; int this_len_byte = len_byte; unsigned char *p = pat; - if (this_pos < lim || this_pos_byte < lim_byte) + if (this_pos < lim || (pos_byte - len_byte) < lim_byte) goto stop; + this_pos_byte = CHAR_TO_BYTE (this_pos); + match_byte = pos_byte - this_pos_byte; while (this_len > 0) { @@ -1589,7 +1569,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) if (this_len == 0) { pos -= len; - pos_byte -= len_byte; + pos_byte -= match_byte; break; } @@ -1608,7 +1588,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) int this_len = len; unsigned char *p = pat; - if (pos - len < lim) + if (this_pos < lim) goto stop; while (this_len > 0) @@ -1625,6 +1605,7 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) if (this_len == 0) { + match_byte = len; pos -= len; break; } @@ -1639,9 +1620,9 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) if (n == 0) { if (forward) - set_search_regs ((multibyte ? pos_byte : pos) - len_byte, len_byte); + set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte); else - set_search_regs (multibyte ? pos_byte : pos, len_byte); + set_search_regs (multibyte ? pos_byte : pos, match_byte); return pos; } @@ -1662,13 +1643,13 @@ simple_search (n, pat, len, len_byte, trt, pos, pos_byte, lim, lim_byte) have nontrivial translation are the same aside from the last byte. This makes it possible to translate just the last byte of a character, and do so after just a simple test of the context. - CHARSET_BASE is nonzero if there is such a non-ASCII character. + CHAR_BASE is nonzero if there is such a non-ASCII character. If that criterion is not satisfied, do not call this function. 
*/ static int boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, - pos, pos_byte, lim, lim_byte, charset_base) + pos, pos_byte, lim, lim_byte, char_base) int n; unsigned char *base_pat; int len, len_byte; @@ -1676,7 +1657,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, Lisp_Object inverse_trt; int pos, pos_byte; int lim, lim_byte; - int charset_base; + int char_base; { int direction = ((n > 0) ? 1 : -1); register int dirlen; @@ -1690,12 +1671,13 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, unsigned char simple_translate[0400]; /* These are set to the preceding bytes of a byte to be translated - if charset_base is nonzero. As the maximum byte length of a - multibyte character is 4, we have to check at most three previous + if char_base is nonzero. As the maximum byte length of a + multibyte character is 5, we have to check at most four previous bytes. */ int translate_prev_byte1 = 0; int translate_prev_byte2 = 0; int translate_prev_byte3 = 0; + int translate_prev_byte4 = 0; BM_tab = (int *) alloca (0400 * sizeof (int)); @@ -1757,20 +1739,23 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, for (i = 0; i < 0400; i++) simple_translate[i] = i; - if (charset_base) + if (char_base) { - /* Setup translate_prev_byte1/2/3 from CHARSET_BASE. Only a + /* Setup translate_prev_byte1/2/3/4 from CHAR_BASE. Only a byte following them are the target of translation. 
*/ - int sample_char = charset_base | 0x20; unsigned char str[MAX_MULTIBYTE_LENGTH]; - int len = CHAR_STRING (sample_char, str); + int len = CHAR_STRING (char_base, str); translate_prev_byte1 = str[len - 2]; if (len > 2) { translate_prev_byte2 = str[len - 3]; if (len > 3) - translate_prev_byte3 = str[len - 4]; + { + translate_prev_byte3 = str[len - 4]; + if (len > 4) + translate_prev_byte4 = str[len - 5]; + } } } @@ -1786,12 +1771,12 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, /* If the byte currently looking at is the last of a character to check case-equivalents, set CH to that character. An ASCII character and a non-ASCII character - matching with CHARSET_BASE are to be checked. */ + matching with CHAR_BASE are to be checked. */ int ch = -1; if (ASCII_BYTE_P (*ptr) || ! multibyte) ch = *ptr; - else if (charset_base + else if (char_base && ((pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1]))) { unsigned char *charstart = ptr - 1; @@ -1799,12 +1784,12 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, while (! (CHAR_HEAD_P (*charstart))) charstart--; ch = STRING_CHAR (charstart, ptr - charstart + 1); - if (charset_base != (ch & ~CHAR_FIELD3_MASK)) + if (char_base != (ch & ~0x3F)) ch = -1; } - if (ch >= 0400) - j = ((unsigned char) ch) | 0200; + if (ch >= 0200) + j = (ch & 0x3F) | 0200; else j = *ptr; @@ -1822,10 +1807,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, while (1) { TRANSLATE (ch, inverse_trt, ch); - if (ch >= 0400) - j = ((unsigned char) ch) | 0200; + if (ch >= 0200) + j = (ch & 0x3F) | 0200; else - j = (unsigned char) ch; + j = ch; /* For all the characters that map into CH, set up simple_translate to map the last byte @@ -2130,19 +2115,21 @@ set_search_regs (beg_byte, nbytes) XSETBUFFER (last_thing_searched, current_buffer); } -/* Given a string of words separated by word delimiters, - compute a regexp that matches those exact words - separated by arbitrary punctuation. 
*/ +/* Given STRING, a string of words separated by word delimiters, + compute a regexp that matches those exact words separated by + arbitrary punctuation. If LAX is nonzero, the end of the string + need not match a word boundary unless it ends in whitespace. */ static Lisp_Object -wordify (string) +wordify (string, lax) Lisp_Object string; + int lax; { register unsigned char *p, *o; register int i, i_byte, len, punct_count = 0, word_count = 0; Lisp_Object val; int prev_c = 0; - int adjust; + int adjust, whitespace_at_end; CHECK_STRING (string); p = SDATA (string); @@ -2152,7 +2139,7 @@ wordify (string) { int c; - FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); + FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte); if (SYNTAX (c) != Sword) { @@ -2165,11 +2152,18 @@ wordify (string) } if (SYNTAX (prev_c) == Sword) - word_count++; + { + word_count++; + whitespace_at_end = 0; + } + else + whitespace_at_end = 1; + if (!word_count) return empty_unibyte_string; - adjust = - punct_count + 5 * (word_count - 1) + 4; + adjust = - punct_count + 5 * (word_count - 1) + + ((lax && !whitespace_at_end) ? 2 : 4); if (STRING_MULTIBYTE (string)) val = make_uninit_multibyte_string (len + adjust, SBYTES (string) @@ -2187,7 +2181,7 @@ wordify (string) int c; int i_byte_orig = i_byte; - FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); + FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte); if (SYNTAX (c) == Sword) { @@ -2207,8 +2201,11 @@ wordify (string) prev_c = c; } - *o++ = '\\'; - *o++ = 'b'; + if (!lax || whitespace_at_end) + { + *o++ = '\\'; + *o++ = 'b'; + } return val; } @@ -2265,7 +2262,7 @@ Optional fourth argument is repeat count--search for successive occurrences. 
*/ (string, bound, noerror, count) Lisp_Object string, bound, noerror, count; { - return search_command (wordify (string), bound, noerror, count, -1, 1, 0); + return search_command (wordify (string, 0), bound, noerror, count, -1, 1, 0); } DEFUN ("word-search-forward", Fword_search_forward, Sword_search_forward, 1, 4, @@ -2280,7 +2277,45 @@ Optional fourth argument is repeat count--search for successive occurrences. */ (string, bound, noerror, count) Lisp_Object string, bound, noerror, count; { - return search_command (wordify (string), bound, noerror, count, 1, 1, 0); + return search_command (wordify (string, 0), bound, noerror, count, 1, 1, 0); +} + +DEFUN ("word-search-backward-lax", Fword_search_backward_lax, Sword_search_backward_lax, 1, 4, + "sWord search backward: ", + doc: /* Search backward from point for STRING, ignoring differences in punctuation. +Set point to the beginning of the occurrence found, and return point. + +Unlike `word-search-backward', the end of STRING need not match a word +boundary unless it ends in whitespace. + +An optional second argument bounds the search; it is a buffer position. +The match found must not extend before that position. +Optional third argument, if t, means if fail just return nil (no error). + If not nil and not t, move to limit of search and return nil. +Optional fourth argument is repeat count--search for successive occurrences. */) + (string, bound, noerror, count) + Lisp_Object string, bound, noerror, count; +{ + return search_command (wordify (string, 1), bound, noerror, count, -1, 1, 0); +} + +DEFUN ("word-search-forward-lax", Fword_search_forward_lax, Sword_search_forward_lax, 1, 4, + "sWord search: ", + doc: /* Search forward from point for STRING, ignoring differences in punctuation. +Set point to the end of the occurrence found, and return point. + +Unlike `word-search-forward', the end of STRING need not match a word +boundary unless it ends in whitespace. 
+ +An optional second argument bounds the search; it is a buffer position. +The match found must not extend after that position. +Optional third argument, if t, means if fail just return nil (no error). + If not nil and not t, move to limit of search and return nil. +Optional fourth argument is repeat count--search for successive occurrences. */) + (string, bound, noerror, count) + Lisp_Object string, bound, noerror, count; +{ + return search_command (wordify (string, 1), bound, noerror, count, 1, 1, 0); } DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 4, @@ -2471,11 +2506,11 @@ since only regular expressions have distinguished subexpressions. */) { if (NILP (string)) { - c = FETCH_CHAR (pos_byte); + c = FETCH_CHAR_AS_MULTIBYTE (pos_byte); INC_BOTH (pos, pos_byte); } else - FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte); + FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte); if (LOWERCASEP (c)) { @@ -2647,10 +2682,7 @@ since only regular expressions have distinguished subexpressions. */) Lisp_Object rev_tbl; int really_changed = 0; - rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table) - ? Fchar_table_extra_slot (Vnonascii_translation_table, - make_number (0)) - : Qnil); + rev_tbl = Qnil; substed_alloc_size = length * 2 + 100; substed = (unsigned char *) xmalloc (substed_alloc_size + 1); @@ -2693,7 +2725,7 @@ since only regular expressions have distinguished subexpressions. */) { FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, pos, pos_byte); - if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c)) + if (!buf_multibyte && !ASCII_CHAR_P (c)) c = multibyte_char_to_unibyte (c, rev_tbl); } else @@ -3253,6 +3285,8 @@ is to bind it with `let' around a small expression. 
*/); defsubr (&Ssearch_backward); defsubr (&Sword_search_forward); defsubr (&Sword_search_backward); + defsubr (&Sword_search_forward_lax); + defsubr (&Sword_search_backward_lax); defsubr (&Sre_search_forward); defsubr (&Sre_search_backward); defsubr (&Sposix_search_forward);