X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/cb6792d2dbc65e1fcf63a45e82b3d8ac35e8313f..563b67aafd1cdfa239c5ce1f6d3d6fc5567dee39:/src/search.c diff --git a/src/search.c b/src/search.c index 225155d73a..34dcc7e78a 100644 --- a/src/search.c +++ b/src/search.c @@ -1,5 +1,5 @@ /* String search routines for GNU Emacs. - Copyright (C) 1985, 86, 87, 93, 94, 97, 1998 Free Software Foundation, Inc. + Copyright (C) 1985, 86,87,93,94,97,98, 1999 Free Software Foundation, Inc. This file is part of GNU Emacs. @@ -20,9 +20,6 @@ Boston, MA 02111-1307, USA. */ #include -#ifdef STDC_HEADERS -#include -#endif #include "lisp.h" #include "syntax.h" #include "category.h" @@ -100,12 +97,6 @@ matcher_overflow () error ("Stack overflow in regexp matcher"); } -#ifdef __STDC__ -#define CONST const -#else -#define CONST -#endif - /* Compile a regexp and signal a Lisp error if anything goes wrong. PATTERN is the pattern to compile. CP is the place to put the result. @@ -182,6 +173,23 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte) cp->regexp = Fcopy_sequence (pattern); } +/* Shrink each compiled regexp buffer in the cache + to the size actually used right now. + This is called from garbage collection. */ + +void +shrink_regexp_cache () +{ + struct regexp_cache *cp, **cpp; + + for (cp = searchbuf_head; cp != 0; cp = cp->next) + { + cp->buf.allocated = cp->buf.used; + cp->buf.buffer + = (unsigned char *) realloc (cp->buf.buffer, cp->buf.used); + } +} + /* Compile a regexp if necessary, but first check to see if there's one in the cache. PATTERN is the pattern to compile. @@ -205,6 +213,13 @@ compile_pattern (pattern, regp, translate, posix, multibyte) for (cpp = &searchbuf_head; ; cpp = &cp->next) { cp = *cpp; + /* Entries are initialized to nil, and may be set to nil by + compile_pattern_1 if the pattern isn't valid. Don't apply + XSTRING in those cases. However, compile_pattern_1 is only + applied to the cache entry we pick here to reuse. So nil + should never appear before a non-nil entry. */ + if (NILP (cp->regexp)) + goto compile_it; if (XSTRING (cp->regexp)->size == XSTRING (pattern)->size && !NILP (Fstring_equal (cp->regexp, pattern)) && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0))) @@ -212,9 +227,12 @@ compile_pattern (pattern, regp, translate, posix, multibyte) && cp->buf.multibyte == multibyte) break; - /* If we're at the end of the cache, compile into the last cell. */ + /* If we're at the end of the cache, compile into the nil cell + we found, or the last (least recently used) cell with a + string value. */ if (cp->next == 0) { + compile_it: compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte); break; } @@ -294,6 +312,8 @@ looking_at_1 (string, posix) i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2, PT_BYTE - BEGV_BYTE, &search_regs, ZV_BYTE - BEGV_BYTE); + immediate_quit = 0; + if (i == -2) matcher_overflow (); @@ -308,7 +328,6 @@ looking_at_1 (string, posix) = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE); } XSETBUFFER (last_thing_searched, current_buffer); - immediate_quit = 0; return val; } @@ -398,6 +417,7 @@ string_match_1 (regexp, string, start, posix) DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0, "Return index of start of first match for REGEXP in STRING, or nil.\n\ +Case is ignored if `case-fold-search' is non-nil in the current buffer.\n\ If third arg START is non-nil, start search at that index in STRING.\n\ For index of first char beyond the match, do (match-end 0).\n\ `match-end' and `match-beginning' also give indices of substrings\n\ @@ -411,6 +431,7 @@ matched by parenthesis constructs in the pattern.") DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 3, 0, "Return index of start of first match for REGEXP in STRING, or nil.\n\ Find the longest match, in accord with Posix regular expression rules.\n\ +Case is ignored if `case-fold-search' is non-nil in the current buffer.\n\ If third arg START is non-nil, start search at that index in STRING.\n\ For index of first char beyond the match, do (match-end 0).\n\ `match-end' and `match-beginning' also give indices of substrings\n\ @@ -1005,17 +1026,14 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, if (running_asynch_code) save_search_regs (); + /* Searching 0 times means don't move. */ /* Null string is found at starting position. */ - if (len == 0) + if (len == 0 || n == 0) { set_search_regs (pos, 0); return pos; } - /* Searching 0 times means don't move. */ - if (n == 0) - return pos; - if (RE && !trivial_regexp_p (string)) { unsigned char *p1, *p2; @@ -1128,7 +1146,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, int multibyte = !NILP (current_buffer->enable_multibyte_characters); unsigned char *base_pat = XSTRING (string)->data; int charset_base = -1; - int simple = 1; + int boyer_moore_ok = 1; /* MULTIBYTE says whether the text to be searched is multibyte. We must convert PATTERN to match that, or we will not really @@ -1175,7 +1193,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, { while (--len >= 0) { - unsigned char workbuf[4], *str; + unsigned char str[MAX_MULTIBYTE_LENGTH]; int c, translated, inverse; int in_charlen, charlen; @@ -1190,17 +1208,26 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, } c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen); + /* Translate the character, if requested. */ TRANSLATE (translated, trt, c); /* If translation changed the byte-length, go back to the original character. */ - charlen = CHAR_STRING (translated, workbuf, str); + charlen = CHAR_STRING (translated, str); if (in_charlen != charlen) { translated = c; - charlen = CHAR_STRING (c, workbuf, str); + charlen = CHAR_STRING (c, str); } + /* If we are searching for something strange, + an invalid multibyte code, don't use boyer-moore. */ + if (! ASCII_BYTE_P (translated) + && (charlen == 1 /* 8bit code */ + || charlen != in_charlen /* invalid multibyte code */ + )) + boyer_moore_ok = 0; + TRANSLATE (inverse, inverse_trt, c); /* Did this char actually get translated? @@ -1209,15 +1236,13 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, { /* Keep track of which character set row contains the characters that need translation. */ - int charset_base_code = c & ~0xff; + int charset_base_code = c & ~CHAR_FIELD3_MASK; if (charset_base == -1) charset_base = charset_base_code; else if (charset_base != charset_base_code) /* If two different rows appear, needing translation, then we cannot use boyer_moore search. */ - simple = 0; - /* ??? Handa: this must do simple = 0 - if c is a composite character. */ + boyer_moore_ok = 0; } /* Store this character into the translated pattern. */ @@ -1229,9 +1254,11 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, } else { + /* Unibyte buffer. */ + charset_base = 0; while (--len >= 0) { - int c, translated, inverse; + int c, translated; /* If we got here and the RE flag is set, it's because we're dealing with a regexp known to be trivial, so the backslash @@ -1243,22 +1270,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, } c = *base_pat++; TRANSLATE (translated, trt, c); - TRANSLATE (inverse, inverse_trt, c); - - /* Did this char actually get translated? - Would any other char get translated into it? */ - if (translated != c || inverse != c) - { - /* Keep track of which character set row - contains the characters that need translation. */ - int charset_base_code = c & ~0xff; - if (charset_base == -1) - charset_base = charset_base_code; - else if (charset_base != charset_base_code) - /* If two different rows appear, needing translation, - then we cannot use boyer_moore search. */ - simple = 0; - } *pat++ = translated; } } @@ -1267,7 +1278,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n, len = raw_pattern_size; pat = base_pat = patbuf; - if (simple) + if (boyer_moore_ok) return boyer_moore (n, pat, len, len_byte, trt, inverse_trt, pos, pos_byte, lim, lim_byte, charset_base); @@ -1612,7 +1623,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, while (! CHAR_HEAD_P (*charstart)) charstart--; untranslated = STRING_CHAR (charstart, ptr - charstart + 1); - if (charset_base == (untranslated & ~0xff)) + if (charset_base == (untranslated & ~CHAR_FIELD3_MASK)) { TRANSLATE (ch, trt, untranslated); if (! CHAR_HEAD_P (*ptr)) @@ -1896,12 +1907,15 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt, } /* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES - for a match just found in the current buffer. */ + for the overall match just found in the current buffer. + Also clear out the match data for registers 1 and up. */ static void set_search_regs (beg_byte, nbytes) int beg_byte, nbytes; { + int i; + /* Make sure we have registers in which to store the match position. */ if (search_regs.num_regs == 0) @@ -1911,6 +1925,13 @@ set_search_regs (beg_byte, nbytes) search_regs.num_regs = 2; } + /* Clear out the other registers. */ + for (i = 1; i < search_regs.num_regs; i++) + { + search_regs.start[i] = -1; + search_regs.end[i] = -1; + } + search_regs.start[0] = BYTE_TO_CHAR (beg_byte); search_regs.end[0] = BYTE_TO_CHAR (beg_byte + nbytes); XSETBUFFER (last_thing_searched, current_buffer); @@ -1959,8 +1980,12 @@ wordify (string) return build_string (""); adjust = - punct_count + 5 * (word_count - 1) + 4; - val = make_uninit_multibyte_string (len + adjust, - STRING_BYTES (XSTRING (string)) + adjust); + if (STRING_MULTIBYTE (string)) + val = make_uninit_multibyte_string (len + adjust, + STRING_BYTES (XSTRING (string)) + + adjust); + else + val = make_uninit_string (len + adjust); o = XSTRING (val)->data; *o++ = '\\'; @@ -1975,7 +2000,10 @@ wordify (string) if (STRING_MULTIBYTE (string)) FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte); else - c = XSTRING (string)->data[i++]; + { + c = XSTRING (string)->data[i++]; + i_byte++; + } if (SYNTAX (c) == Sword) { @@ -2163,7 +2191,7 @@ since only regular expressions have distinguished subexpressions.") Lisp_Object newtext, fixedcase, literal, string, subexp; { enum { nochange, all_caps, cap_initial } case_action; - register int pos, last; + register int pos, pos_byte; int some_multiletter_word; int some_lowercase; int some_uppercase; @@ -2213,18 +2241,16 @@ since only regular expressions have distinguished subexpressions.") if (NILP (fixedcase)) { - int beg; /* Decide how to casify by examining the matched text. */ + int last; - if (NILP (string)) - last = CHAR_TO_BYTE (search_regs.end[sub]); - else - last = search_regs.end[sub]; + pos = search_regs.start[sub]; + last = search_regs.end[sub]; if (NILP (string)) - beg = CHAR_TO_BYTE (search_regs.start[sub]); + pos_byte = CHAR_TO_BYTE (pos); else - beg = search_regs.start[sub]; + pos_byte = string_char_to_byte (string, pos); prevc = '\n'; case_action = all_caps; @@ -2236,12 +2262,15 @@ since only regular expressions have distinguished subexpressions.") some_nonuppercase_initial = 0; some_uppercase = 0; - for (pos = beg; pos < last; pos++) + while (pos < last) { if (NILP (string)) - c = FETCH_BYTE (pos); + { + c = FETCH_CHAR (pos_byte); + INC_BOTH (pos, pos_byte); + } else - c = XSTRING (string)->data[pos]; + FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte); if (LOWERCASEP (c)) { @@ -2300,16 +2329,16 @@ since only regular expressions have distinguished subexpressions.") if desired. */ if (NILP (literal)) { - int lastpos = -1; - int lastpos_byte = -1; + int lastpos = 0; + int lastpos_byte = 0; /* We build up the substituted string in ACCUM. */ Lisp_Object accum; Lisp_Object middle; - int pos_byte; + int length = STRING_BYTES (XSTRING (newtext)); accum = Qnil; - for (pos_byte = 0, pos = 0; pos_byte < STRING_BYTES (XSTRING (newtext));) + for (pos_byte = 0, pos = 0; pos_byte < length;) { int substart = -1; int subend; @@ -2340,10 +2369,10 @@ since only regular expressions have distinguished subexpressions.") } if (substart >= 0) { - if (pos - 1 != lastpos + 1) - middle = substring_both (newtext, lastpos + 1, - lastpos_byte + 1, - pos - 1, pos_byte - 1); + if (pos - 2 != lastpos) + middle = substring_both (newtext, lastpos, + lastpos_byte, + pos - 2, pos_byte - 2); else middle = Qnil; accum = concat3 (accum, middle, @@ -2355,9 +2384,9 @@ since only regular expressions have distinguished subexpressions.") } else if (delbackslash) { - middle = substring_both (newtext, lastpos + 1, - lastpos_byte + 1, - pos, pos_byte); + middle = substring_both (newtext, lastpos, + lastpos_byte, + pos - 1, pos_byte - 1); accum = concat2 (accum, middle); lastpos = pos; @@ -2365,9 +2394,9 @@ since only regular expressions have distinguished subexpressions.") } } - if (pos != lastpos + 1) - middle = substring_both (newtext, lastpos + 1, - lastpos_byte + 1, + if (pos != lastpos) + middle = substring_both (newtext, lastpos, + lastpos_byte, pos, pos_byte); else middle = Qnil; @@ -2385,8 +2414,10 @@ since only regular expressions have distinguished subexpressions.") } /* Record point, the move (quietly) to the start of the match. */ - if (PT > search_regs.start[sub]) + if (PT >= search_regs.end[sub]) opoint = PT - ZV; + else if (PT > search_regs.start[sub]) + opoint = search_regs.end[sub] - ZV; else opoint = PT; @@ -2400,39 +2431,118 @@ since only regular expressions have distinguished subexpressions.") Finsert_and_inherit (1, &newtext); else { - struct gcpro gcpro1; - GCPRO1 (newtext); - - for (pos = 0; pos < XSTRING (newtext)->size; pos++) + int length = STRING_BYTES (XSTRING (newtext)); + unsigned char *substed; + int substed_alloc_size, substed_len; + int buf_multibyte = !NILP (current_buffer->enable_multibyte_characters); + int str_multibyte = STRING_MULTIBYTE (newtext); + Lisp_Object rev_tbl; + + rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table) + ? Fchar_table_extra_slot (Vnonascii_translation_table, + make_number (0)) + : Qnil); + + substed_alloc_size = length * 2 + 100; + substed = (unsigned char *) xmalloc (substed_alloc_size + 1); + substed_len = 0; + + /* Go thru NEWTEXT, producing the actual text to insert in + SUBSTED while adjusting multibyteness to that of the current + buffer. */ + + for (pos_byte = 0, pos = 0; pos_byte < length;) { - int offset = PT - search_regs.start[sub]; + unsigned char str[MAX_MULTIBYTE_LENGTH]; + unsigned char *add_stuff; + int add_len; + int idx = -1; + + if (str_multibyte) + { + FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte); + if (!buf_multibyte) + c = multibyte_char_to_unibyte (c, rev_tbl); + } + else + { + /* Note that we don't have to increment POS. */ + c = XSTRING (newtext)->data[pos_byte++]; + if (buf_multibyte) + c = unibyte_char_to_multibyte (c); + } + + /* Either set ADD_STUFF and ADD_LEN to the text to put in SUBSTED, + or set IDX to a match index, which means put that part + of the buffer text into SUBSTED. */ - c = XSTRING (newtext)->data[pos]; if (c == '\\') { - c = XSTRING (newtext)->data[++pos]; + if (str_multibyte) + { + FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte); + if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c)) + c = multibyte_char_to_unibyte (c, rev_tbl); + } + else + { + c = XSTRING (newtext)->data[pos_byte++]; + if (buf_multibyte) + c = unibyte_char_to_multibyte (c); + } + if (c == '&') - Finsert_buffer_substring - (Fcurrent_buffer (), - make_number (search_regs.start[sub] + offset), - make_number (search_regs.end[sub] + offset)); + idx = sub; else if (c >= '1' && c <= '9' && c <= search_regs.num_regs + '0') { if (search_regs.start[c - '0'] >= 1) - Finsert_buffer_substring - (Fcurrent_buffer (), - make_number (search_regs.start[c - '0'] + offset), - make_number (search_regs.end[c - '0'] + offset)); + idx = c - '0'; } else if (c == '\\') - insert_char (c); + add_len = 1, add_stuff = "\\"; else - error ("Invalid use of `\\' in replacement text"); + { + xfree (substed); + error ("Invalid use of `\\' in replacement text"); + } } else - insert_char (c); + { + add_len = CHAR_STRING (c, str); + add_stuff = str; + } + + /* If we want to copy part of a previous match, + set up ADD_STUFF and ADD_LEN to point to it. */ + if (idx >= 0) + { + int begbyte = CHAR_TO_BYTE (search_regs.start[idx]); + add_len = CHAR_TO_BYTE (search_regs.end[idx]) - begbyte; + if (search_regs.start[idx] < GPT && GPT < search_regs.end[idx]) + move_gap (search_regs.start[idx]); + add_stuff = BYTE_POS_ADDR (begbyte); + } + + /* Now the stuff we want to add to SUBSTED + is invariably ADD_LEN bytes starting at ADD_STUFF. */ + + /* Make sure SUBSTED is big enough. */ + if (substed_len + add_len >= substed_alloc_size) + { + substed_alloc_size = substed_len + add_len + 500; + substed = (unsigned char *) xrealloc (substed, + substed_alloc_size + 1); + } + + /* Now add to the end of SUBSTED. */ + bcopy (add_stuff, substed + substed_len, add_len); + substed_len += add_len; } - UNGCPRO; + + /* Now insert what we accumulated. */ + insert_and_inherit (substed, substed_len); + + xfree (substed); } inslen = PT - (search_regs.start[sub]); @@ -2565,19 +2675,19 @@ to hold all the values, and if INTEGERS is non-nil, no consing is done.") /* If REUSE is a list, store as many value elements as will fit into the elements of REUSE. */ for (i = 0, tail = reuse; CONSP (tail); - i++, tail = XCONS (tail)->cdr) + i++, tail = XCDR (tail)) { if (i < 2 * len + 2) - XCONS (tail)->car = data[i]; + XCAR (tail) = data[i]; else - XCONS (tail)->car = Qnil; + XCAR (tail) = Qnil; prev = tail; } /* If we couldn't fit all value elements into REUSE, cons up the rest of them and add them to the end of REUSE. */ if (i < 2 * len + 2) - XCONS (prev)->cdr = Flist (2 * len + 2 - i, data + i); + XCDR (prev) = Flist (2 * len + 2 - i, data + i); return reuse; }