X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/cb6792d2dbc65e1fcf63a45e82b3d8ac35e8313f..563b67aafd1cdfa239c5ce1f6d3d6fc5567dee39:/src/search.c

diff --git a/src/search.c b/src/search.c
index 225155d73a..34dcc7e78a 100644
--- a/src/search.c
+++ b/src/search.c
@@ -1,5 +1,5 @@
 /* String search routines for GNU Emacs.
-   Copyright (C) 1985, 86, 87, 93, 94, 97, 1998 Free Software Foundation, Inc.
+   Copyright (C) 1985, 86,87,93,94,97,98, 1999 Free Software Foundation, Inc.
 
 This file is part of GNU Emacs.
 
@@ -20,9 +20,6 @@ Boston, MA 02111-1307, USA.  */
 
 
 #include <config.h>
-#ifdef STDC_HEADERS
-#include <stdlib.h>
-#endif
 #include "lisp.h"
 #include "syntax.h"
 #include "category.h"
@@ -100,12 +97,6 @@ matcher_overflow ()
   error ("Stack overflow in regexp matcher");
 }
 
-#ifdef __STDC__
-#define CONST const
-#else
-#define CONST
-#endif
-
 /* Compile a regexp and signal a Lisp error if anything goes wrong.
    PATTERN is the pattern to compile.
    CP is the place to put the result.
@@ -182,6 +173,23 @@ compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte)
   cp->regexp = Fcopy_sequence (pattern);
 }
 
+/* Shrink each compiled regexp buffer in the cache to the size actually
+   used right now.  If realloc fails for a nonzero size, the entry keeps
+   its old buffer and size.  This is called from garbage collection.  */
+
+void
+shrink_regexp_cache ()
+{
+  struct regexp_cache *cp;
+  unsigned char *shrunk;
+  for (cp = searchbuf_head; cp != 0; cp = cp->next)
+    {
+      shrunk = (unsigned char *) realloc (cp->buf.buffer, cp->buf.used);
+      if (shrunk != 0 || cp->buf.used == 0)
+	cp->buf.buffer = shrunk, cp->buf.allocated = cp->buf.used;
+    }
+}
+
 /* Compile a regexp if necessary, but first check to see if there's one in
    the cache.
    PATTERN is the pattern to compile.
@@ -205,6 +213,13 @@ compile_pattern (pattern, regp, translate, posix, multibyte)
   for (cpp = &searchbuf_head; ; cpp = &cp->next)
     {
       cp = *cpp;
+      /* Entries are initialized to nil, and may be set to nil by
+	 compile_pattern_1 if the pattern isn't valid.  Don't apply
+	 XSTRING in those cases.  However, compile_pattern_1 is only
+	 applied to the cache entry we pick here to reuse.  So nil
+	 should never appear before a non-nil entry.  */
+      if (NILP (cp->regexp))
+	goto compile_it;
       if (XSTRING (cp->regexp)->size == XSTRING (pattern)->size
 	  && !NILP (Fstring_equal (cp->regexp, pattern))
 	  && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
@@ -212,9 +227,12 @@ compile_pattern (pattern, regp, translate, posix, multibyte)
 	  && cp->buf.multibyte == multibyte)
 	break;
 
-      /* If we're at the end of the cache, compile into the last cell.  */
+      /* If we're at the end of the cache, compile into the nil cell
+	 we found, or the last (least recently used) cell with a
+	 string value.  */
       if (cp->next == 0)
 	{
+	compile_it:
 	  compile_pattern_1 (cp, pattern, translate, regp, posix, multibyte);
 	  break;
 	}
@@ -294,6 +312,8 @@ looking_at_1 (string, posix)
   i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2,
 		  PT_BYTE - BEGV_BYTE, &search_regs,
 		  ZV_BYTE - BEGV_BYTE);
+  immediate_quit = 0;
+  
   if (i == -2)
     matcher_overflow ();
 
@@ -308,7 +328,6 @@ looking_at_1 (string, posix)
 	    = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
 	}
   XSETBUFFER (last_thing_searched, current_buffer);
-  immediate_quit = 0;
   return val;
 }
 
@@ -398,6 +417,7 @@ string_match_1 (regexp, string, start, posix)
 
 DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
   "Return index of start of first match for REGEXP in STRING, or nil.\n\
+Case is ignored if `case-fold-search' is non-nil in the current buffer.\n\
 If third arg START is non-nil, start search at that index in STRING.\n\
 For index of first char beyond the match, do (match-end 0).\n\
 `match-end' and `match-beginning' also give indices of substrings\n\
@@ -411,6 +431,7 @@ matched by parenthesis constructs in the pattern.")
 DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 3, 0,
   "Return index of start of first match for REGEXP in STRING, or nil.\n\
 Find the longest match, in accord with Posix regular expression rules.\n\
+Case is ignored if `case-fold-search' is non-nil in the current buffer.\n\
 If third arg START is non-nil, start search at that index in STRING.\n\
 For index of first char beyond the match, do (match-end 0).\n\
 `match-end' and `match-beginning' also give indices of substrings\n\
@@ -1005,17 +1026,14 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
   if (running_asynch_code)
     save_search_regs ();
 
+  /* Searching 0 times means don't move.  */
   /* Null string is found at starting position.  */
-  if (len == 0)
+  if (len == 0 || n == 0)
     {
       set_search_regs (pos, 0);
       return pos;
     }
 
-  /* Searching 0 times means don't move.  */
-  if (n == 0)
-    return pos;
-
   if (RE && !trivial_regexp_p (string))
     {
       unsigned char *p1, *p2;
@@ -1128,7 +1146,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
       int multibyte = !NILP (current_buffer->enable_multibyte_characters);
       unsigned char *base_pat = XSTRING (string)->data;
       int charset_base = -1;
-      int simple = 1;
+      int boyer_moore_ok = 1;
 
       /* MULTIBYTE says whether the text to be searched is multibyte.
 	 We must convert PATTERN to match that, or we will not really
@@ -1175,7 +1193,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
 	{
 	  while (--len >= 0)
 	    {
-	      unsigned char workbuf[4], *str;
+	      unsigned char str[MAX_MULTIBYTE_LENGTH];
 	      int c, translated, inverse;
 	      int in_charlen, charlen;
 
@@ -1190,17 +1208,26 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
 		}
 
 	      c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
+
 	      /* Translate the character, if requested.  */
 	      TRANSLATE (translated, trt, c);
 	      /* If translation changed the byte-length, go back
 		 to the original character.  */
-	      charlen = CHAR_STRING (translated, workbuf, str);
+	      charlen = CHAR_STRING (translated, str);
 	      if (in_charlen != charlen)
 		{
 		  translated = c;
-		  charlen = CHAR_STRING (c, workbuf, str);
+		  charlen = CHAR_STRING (c, str);
 		}
 
+	      /* If we are searching for something strange,
+		 an invalid multibyte code, don't use boyer-moore.  */
+	      if (! ASCII_BYTE_P (translated)
+		  && (charlen == 1 /* 8bit code */
+		      || charlen != in_charlen /* invalid multibyte code */
+		      ))
+		boyer_moore_ok = 0;
+
 	      TRANSLATE (inverse, inverse_trt, c);
 
 	      /* Did this char actually get translated?
@@ -1209,15 +1236,13 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
 		{
 		  /* Keep track of which character set row
 		     contains the characters that need translation.  */
-		  int charset_base_code = c & ~0xff;
+		  int charset_base_code = c & ~CHAR_FIELD3_MASK;
 		  if (charset_base == -1)
 		    charset_base = charset_base_code;
 		  else if (charset_base != charset_base_code)
 		    /* If two different rows appear, needing translation,
 		       then we cannot use boyer_moore search.  */
-		    simple = 0;
-		    /* ??? Handa: this must do simple = 0
-		       if c is a composite character.  */
+		    boyer_moore_ok = 0;
 		}
 
 	      /* Store this character into the translated pattern.  */
@@ -1229,9 +1254,11 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
 	}
       else
 	{
+	  /* Unibyte buffer.  */
+	  charset_base = 0;
 	  while (--len >= 0)
 	    {
-	      int c, translated, inverse;
+	      int c, translated;
 
 	      /* If we got here and the RE flag is set, it's because we're
 		 dealing with a regexp known to be trivial, so the backslash
@@ -1243,22 +1270,6 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
 		}
 	      c = *base_pat++;
 	      TRANSLATE (translated, trt, c);
-	      TRANSLATE (inverse, inverse_trt, c);
-
-	      /* Did this char actually get translated?
-		 Would any other char get translated into it?  */
-	      if (translated != c || inverse != c)
-		{
-		  /* Keep track of which character set row
-		     contains the characters that need translation.  */
-		  int charset_base_code = c & ~0xff;
-		  if (charset_base == -1)
-		    charset_base = charset_base_code;
-		  else if (charset_base != charset_base_code)
-		    /* If two different rows appear, needing translation,
-		       then we cannot use boyer_moore search.  */
-		    simple = 0;
-		}
 	      *pat++ = translated;
 	    }
 	}
@@ -1267,7 +1278,7 @@ search_buffer (string, pos, pos_byte, lim, lim_byte, n,
       len = raw_pattern_size;
       pat = base_pat = patbuf;
 
-      if (simple)
+      if (boyer_moore_ok)
 	return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
 			    pos, pos_byte, lim, lim_byte,
 			    charset_base);
@@ -1612,7 +1623,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
 	      while (! CHAR_HEAD_P (*charstart))
 		charstart--;
 	      untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
-	      if (charset_base == (untranslated & ~0xff))
+	      if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
 		{
 		  TRANSLATE (ch, trt, untranslated);
 		  if (! CHAR_HEAD_P (*ptr))
@@ -1896,12 +1907,15 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
 }
 
 /* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES
-   for a match just found in the current buffer.  */
+   for the overall match just found in the current buffer.
+   Also clear out the match data for registers 1 and up.  */
 
 static void
 set_search_regs (beg_byte, nbytes)
      int beg_byte, nbytes;
 {
+  int i;
+
   /* Make sure we have registers in which to store
      the match position.  */
   if (search_regs.num_regs == 0)
@@ -1911,6 +1925,13 @@ set_search_regs (beg_byte, nbytes)
       search_regs.num_regs = 2;
     }
 
+  /* Clear out the other registers.  */
+  for (i = 1; i < search_regs.num_regs; i++)
+    {
+      search_regs.start[i] = -1;
+      search_regs.end[i] = -1;
+    }
+
   search_regs.start[0] = BYTE_TO_CHAR (beg_byte);
   search_regs.end[0] = BYTE_TO_CHAR (beg_byte + nbytes);
   XSETBUFFER (last_thing_searched, current_buffer);
@@ -1959,8 +1980,12 @@ wordify (string)
     return build_string ("");
 
   adjust = - punct_count + 5 * (word_count - 1) + 4;
-  val = make_uninit_multibyte_string (len + adjust,
-				      STRING_BYTES (XSTRING (string)) + adjust);
+  if (STRING_MULTIBYTE (string))
+    val = make_uninit_multibyte_string (len + adjust,
+					STRING_BYTES (XSTRING (string))
+					+ adjust);
+  else
+    val = make_uninit_string (len + adjust);
 
   o = XSTRING (val)->data;
   *o++ = '\\';
@@ -1975,7 +2000,10 @@ wordify (string)
       if (STRING_MULTIBYTE (string))
 	FETCH_STRING_CHAR_ADVANCE (c, string, i, i_byte);
       else
-	c = XSTRING (string)->data[i++];
+	{
+	  c = XSTRING (string)->data[i++];
+	  i_byte++;
+	}
 
       if (SYNTAX (c) == Sword)
 	{
@@ -2163,7 +2191,7 @@ since only regular expressions have distinguished subexpressions.")
      Lisp_Object newtext, fixedcase, literal, string, subexp;
 {
   enum { nochange, all_caps, cap_initial } case_action;
-  register int pos, last;
+  register int pos, pos_byte;
   int some_multiletter_word;
   int some_lowercase;
   int some_uppercase;
@@ -2213,18 +2241,16 @@ since only regular expressions have distinguished subexpressions.")
 
   if (NILP (fixedcase))
     {
-      int beg;
       /* Decide how to casify by examining the matched text. */
+      int last;
 
-      if (NILP (string))
-	last = CHAR_TO_BYTE (search_regs.end[sub]);
-      else
-	last = search_regs.end[sub];
+      pos = search_regs.start[sub];
+      last = search_regs.end[sub];
 
       if (NILP (string))
-	beg = CHAR_TO_BYTE (search_regs.start[sub]);
+	pos_byte = CHAR_TO_BYTE (pos);
       else
-	beg = search_regs.start[sub];
+	pos_byte = string_char_to_byte (string, pos);
 
       prevc = '\n';
       case_action = all_caps;
@@ -2236,12 +2262,15 @@ since only regular expressions have distinguished subexpressions.")
       some_nonuppercase_initial = 0;
       some_uppercase = 0;
 
-      for (pos = beg; pos < last; pos++)
+      while (pos < last)
 	{
 	  if (NILP (string))
-	    c = FETCH_BYTE (pos);
+	    {
+	      c = FETCH_CHAR (pos_byte);
+	      INC_BOTH (pos, pos_byte);
+	    }
 	  else
-	    c = XSTRING (string)->data[pos];
+	    FETCH_STRING_CHAR_ADVANCE (c, string, pos, pos_byte);
 
 	  if (LOWERCASEP (c))
 	    {
@@ -2300,16 +2329,16 @@ since only regular expressions have distinguished subexpressions.")
 	 if desired.  */
       if (NILP (literal))
 	{
-	  int lastpos = -1;
-	  int lastpos_byte = -1;
+	  int lastpos = 0;
+	  int lastpos_byte = 0;
 	  /* We build up the substituted string in ACCUM.  */
 	  Lisp_Object accum;
 	  Lisp_Object middle;
-	  int pos_byte;
+	  int length = STRING_BYTES (XSTRING (newtext));
 
 	  accum = Qnil;
 
-	  for (pos_byte = 0, pos = 0; pos_byte < STRING_BYTES (XSTRING (newtext));)
+	  for (pos_byte = 0, pos = 0; pos_byte < length;)
 	    {
 	      int substart = -1;
 	      int subend;
@@ -2340,10 +2369,10 @@ since only regular expressions have distinguished subexpressions.")
 		}
 	      if (substart >= 0)
 		{
-		  if (pos - 1 != lastpos + 1)
-		    middle = substring_both (newtext, lastpos + 1,
-					     lastpos_byte + 1,
-					     pos - 1, pos_byte - 1);
+		  if (pos - 2 != lastpos)
+		    middle = substring_both (newtext, lastpos,
+					     lastpos_byte,
+					     pos - 2, pos_byte - 2);
 		  else
 		    middle = Qnil;
 		  accum = concat3 (accum, middle,
@@ -2355,9 +2384,9 @@ since only regular expressions have distinguished subexpressions.")
 		}
 	      else if (delbackslash)
 		{
-		  middle = substring_both (newtext, lastpos + 1,
-					   lastpos_byte + 1,
-					   pos, pos_byte);
+		  middle = substring_both (newtext, lastpos,
+					   lastpos_byte,
+					   pos - 1, pos_byte - 1);
 
 		  accum = concat2 (accum, middle);
 		  lastpos = pos;
@@ -2365,9 +2394,9 @@ since only regular expressions have distinguished subexpressions.")
 		}
 	    }
 
-	  if (pos != lastpos + 1)
-	    middle = substring_both (newtext, lastpos + 1,
-				     lastpos_byte + 1,
+	  if (pos != lastpos)
+	    middle = substring_both (newtext, lastpos,
+				     lastpos_byte,
 				     pos, pos_byte);
 	  else
 	    middle = Qnil;
@@ -2385,8 +2414,10 @@ since only regular expressions have distinguished subexpressions.")
     }
 
   /* Record point, the move (quietly) to the start of the match.  */
-  if (PT > search_regs.start[sub])
+  if (PT >= search_regs.end[sub])
     opoint = PT - ZV;
+  else if (PT > search_regs.start[sub])
+    opoint = search_regs.end[sub] - ZV;
   else
     opoint = PT;
 
@@ -2400,39 +2431,118 @@ since only regular expressions have distinguished subexpressions.")
     Finsert_and_inherit (1, &newtext);
   else
     {
-      struct gcpro gcpro1;
-      GCPRO1 (newtext);
-
-      for (pos = 0; pos < XSTRING (newtext)->size; pos++)
+      int length = STRING_BYTES (XSTRING (newtext));
+      unsigned char *substed;
+      int substed_alloc_size, substed_len;
+      int buf_multibyte = !NILP (current_buffer->enable_multibyte_characters);
+      int str_multibyte = STRING_MULTIBYTE (newtext);
+      Lisp_Object rev_tbl;
+
+      rev_tbl= (!buf_multibyte && CHAR_TABLE_P (Vnonascii_translation_table)
+		? Fchar_table_extra_slot (Vnonascii_translation_table,
+					  make_number (0))
+		: Qnil);
+
+      substed_alloc_size = length * 2 + 100;
+      substed = (unsigned char *) xmalloc (substed_alloc_size + 1);
+      substed_len = 0;
+
+      /* Go thru NEWTEXT, producing the actual text to insert in
+	 SUBSTED while adjusting multibyteness to that of the current
+	 buffer.  */
+
+      for (pos_byte = 0, pos = 0; pos_byte < length;)
 	{
-	  int offset = PT - search_regs.start[sub];
+	  unsigned char str[MAX_MULTIBYTE_LENGTH];
+	  unsigned char *add_stuff = 0;	/* Text to append to SUBSTED.  */
+	  int add_len = 0;	/* Stays 0 if \N names an unmatched group.  */
+	  int idx = -1;		/* Match register to copy, or -1 for none.  */
+
+	  if (str_multibyte)
+	    {
+	      FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
+	      if (!buf_multibyte)
+		c = multibyte_char_to_unibyte (c, rev_tbl);
+	    }
+	  else
+	    {
+	      /* Note that we don't have to increment POS.  */
+	      c = XSTRING (newtext)->data[pos_byte++];
+	      if (buf_multibyte)
+		c = unibyte_char_to_multibyte (c);
+	    }
+
+	  /* Either set ADD_STUFF and ADD_LEN to the text to put in SUBSTED,
+	     or set IDX to a match index, which means put that part
+	     of the buffer text into SUBSTED.  */
 
-	  c = XSTRING (newtext)->data[pos];
 	  if (c == '\\')
 	    {
-	      c = XSTRING (newtext)->data[++pos];
+	      if (str_multibyte)
+		{
+		  FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
+		  if (!buf_multibyte && !SINGLE_BYTE_CHAR_P (c))
+		    c = multibyte_char_to_unibyte (c, rev_tbl);
+		}
+	      else
+		{
+		  c = XSTRING (newtext)->data[pos_byte++];
+		  if (buf_multibyte)
+		    c = unibyte_char_to_multibyte (c);
+		}
+
 	      if (c == '&')
-		Finsert_buffer_substring
-		  (Fcurrent_buffer (),
-		   make_number (search_regs.start[sub] + offset),
-		   make_number (search_regs.end[sub] + offset));
+		idx = sub;
 	      else if (c >= '1' && c <= '9' && c <= search_regs.num_regs + '0')
 		{
 		  if (search_regs.start[c - '0'] >= 1)
-		    Finsert_buffer_substring
-		      (Fcurrent_buffer (),
-		       make_number (search_regs.start[c - '0'] + offset),
-		       make_number (search_regs.end[c - '0'] + offset));
+		    idx = c - '0';
 		}
 	      else if (c == '\\')
-		insert_char (c);
+		add_len = 1, add_stuff = "\\";
 	      else
-		error ("Invalid use of `\\' in replacement text");
+		{
+		  xfree (substed);
+		  error ("Invalid use of `\\' in replacement text");
+		}
 	    }
 	  else
-	    insert_char (c);
+	    {
+	      add_len = CHAR_STRING (c, str);
+	      add_stuff = str;
+	    }
+
+	  /* If we want to copy part of a previous match,
+	     set up ADD_STUFF and ADD_LEN to point to it.  */
+	  if (idx >= 0)
+	    {
+	      int begbyte = CHAR_TO_BYTE (search_regs.start[idx]);
+	      add_len = CHAR_TO_BYTE (search_regs.end[idx]) - begbyte;
+	      if (search_regs.start[idx] < GPT && GPT < search_regs.end[idx])
+		move_gap (search_regs.start[idx]);
+	      add_stuff = BYTE_POS_ADDR (begbyte);
+	    }
+
+	  /* Now the stuff we want to add to SUBSTED
+	     is invariably ADD_LEN bytes starting at ADD_STUFF.  */
+
+	  /* Make sure SUBSTED is big enough.  */
+	  if (substed_len + add_len >= substed_alloc_size)
+	    {
+	      substed_alloc_size = substed_len + add_len + 500;
+	      substed = (unsigned char *) xrealloc (substed,
+						    substed_alloc_size + 1);
+	    }
+
+	  /* Now add to the end of SUBSTED.  */
+	  bcopy (add_stuff, substed + substed_len, add_len);
+	  substed_len += add_len;
 	}
-      UNGCPRO;
+
+      /* Now insert what we accumulated.  */
+      insert_and_inherit (substed, substed_len);
+
+      xfree (substed);
     }
 
   inslen = PT - (search_regs.start[sub]);
@@ -2565,19 +2675,19 @@ to hold all the values, and if INTEGERS is non-nil, no consing is done.")
   /* If REUSE is a list, store as many value elements as will fit
      into the elements of REUSE.  */
   for (i = 0, tail = reuse; CONSP (tail);
-       i++, tail = XCONS (tail)->cdr)
+       i++, tail = XCDR (tail))
     {
       if (i < 2 * len + 2)
-	XCONS (tail)->car = data[i];
+	XCAR (tail) = data[i];
       else
-	XCONS (tail)->car = Qnil;
+	XCAR (tail) = Qnil;
       prev = tail;
     }
 
   /* If we couldn't fit all value elements into REUSE,
      cons up the rest of them and add them to the end of REUSE.  */
   if (i < 2 * len + 2)
-    XCONS (prev)->cdr = Flist (2 * len + 2 - i, data + i);
+    XCDR (prev) = Flist (2 * len + 2 - i, data + i);
 
   return reuse;
 }