]> code.delx.au - gnu-emacs/blob - src/regex.c
(make-comint): Error, if start-process is not fboundp.
[gnu-emacs] / src / regex.c
1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993, 1994 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
26
27 #define _GNU_SOURCE
28
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 /* We need this for `regex.h', and perhaps for the Emacs include files. */
34 #include <sys/types.h>
35
36 /* This is for other GNU distributions with internationalized messages.
37 The GNU C Library itself does not yet support such messages. */
38 #if HAVE_LIBINTL_H
39 # include <libintl.h>
40 #else
41 # define gettext(msgid) (msgid)
42 #endif
43
44 /* The `emacs' switch turns on certain matching commands
45 that make sense only in Emacs. */
46 #ifdef emacs
47
48 #include "lisp.h"
49 #include "buffer.h"
50 #include "syntax.h"
51
52 #else /* not emacs */
53
54 #ifdef STDC_HEADERS
55 #include <stdlib.h>
56 #else
57 char *malloc ();
58 char *realloc ();
59 #endif
60
61
62 /* We used to test for `BSTRING' here, but only GCC and Emacs define
63 `BSTRING', as far as I know, and neither of them use this code. */
64 #ifndef INHIBIT_STRING_HEADER
65 #if HAVE_STRING_H || STDC_HEADERS
66 #include <string.h>
67 #ifndef bcmp
68 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
69 #endif
70 #ifndef bcopy
71 #define bcopy(s, d, n) memcpy ((d), (s), (n))
72 #endif
73 #ifndef bzero
74 #define bzero(s, n) memset ((s), 0, (n))
75 #endif
76 #else
77 #include <strings.h>
78 #endif
79 #endif
80
81 /* Define the syntax stuff for \<, \>, etc. */
82
83 /* This must be nonzero for the wordchar and notwordchar pattern
84 commands in re_match_2. */
85 #ifndef Sword
86 #define Sword 1
87 #endif
88
89 #ifdef SWITCH_ENUM_BUG
90 #define SWITCH_ENUM_CAST(x) ((int)(x))
91 #else
92 #define SWITCH_ENUM_CAST(x) (x)
93 #endif
94
95 #ifdef SYNTAX_TABLE
96
97 extern char *re_syntax_table;
98
99 #else /* not SYNTAX_TABLE */
100
101 /* How many characters in the character set. */
102 #define CHAR_SET_SIZE 256
103
104 static char re_syntax_table[CHAR_SET_SIZE];
105
106 static void
107 init_syntax_once ()
108 {
109 register int c;
110 static int done = 0;
111
112 if (done)
113 return;
114
115 bzero (re_syntax_table, sizeof re_syntax_table);
116
117 for (c = 'a'; c <= 'z'; c++)
118 re_syntax_table[c] = Sword;
119
120 for (c = 'A'; c <= 'Z'; c++)
121 re_syntax_table[c] = Sword;
122
123 for (c = '0'; c <= '9'; c++)
124 re_syntax_table[c] = Sword;
125
126 re_syntax_table['_'] = Sword;
127
128 done = 1;
129 }
130
131 #endif /* not SYNTAX_TABLE */
132
133 #define SYNTAX(c) re_syntax_table[c]
134
135 #endif /* not emacs */
136 \f
137 /* Get the interface, including the syntax bits. */
138 #include "regex.h"
139
140 /* isalpha etc. are used for the character classes. */
141 #include <ctype.h>
142
143 /* Jim Meyering writes:
144
145 "... Some ctype macros are valid only for character codes that
146 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
147 using /bin/cc or gcc but without giving an ansi option). So, all
148 ctype uses should be through macros like ISPRINT... If
149 STDC_HEADERS is defined, then autoconf has verified that the ctype
150 macros don't need to be guarded with references to isascii. ...
151 Defining isascii to 1 should let any compiler worth its salt
152 eliminate the && through constant folding." */
153
154 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
155 #define ISASCII(c) 1
156 #else
157 #define ISASCII(c) isascii(c)
158 #endif
159
160 #ifdef isblank
161 #define ISBLANK(c) (ISASCII (c) && isblank (c))
162 #else
163 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
164 #endif
165 #ifdef isgraph
166 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
167 #else
168 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
169 #endif
170
171 #define ISPRINT(c) (ISASCII (c) && isprint (c))
172 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
173 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
174 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
175 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
176 #define ISLOWER(c) (ISASCII (c) && islower (c))
177 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
178 #define ISSPACE(c) (ISASCII (c) && isspace (c))
179 #define ISUPPER(c) (ISASCII (c) && isupper (c))
180 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
181
182 #ifndef NULL
183 #define NULL 0
184 #endif
185
186 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
187 since ours (we hope) works properly with all combinations of
188 machines, compilers, `char' and `unsigned char' argument types.
189 (Per Bothner suggested the basic approach.) */
190 #undef SIGN_EXTEND_CHAR
191 #if __STDC__
192 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
193 #else /* not __STDC__ */
194 /* As in Harbison and Steele. */
195 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
196 #endif
197 \f
198 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
199 use `alloca' instead of `malloc'. This is because using malloc in
200 re_search* or re_match* could cause memory leaks when C-g is used in
201 Emacs; also, malloc is slower and causes storage fragmentation. On
202 the other hand, malloc is more portable, and easier to debug.
203
204 Because we sometimes use alloca, some routines have to be macros,
205 not functions -- `alloca'-allocated space disappears at the end of the
206 function it is called in. */
207
208 #ifdef REGEX_MALLOC
209
210 #define REGEX_ALLOCATE malloc
211 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
212
213 #else /* not REGEX_MALLOC */
214
215 /* Emacs already defines alloca, sometimes. */
216 #ifndef alloca
217
218 /* Make alloca work the best possible way. */
219 #ifdef __GNUC__
220 #define alloca __builtin_alloca
221 #else /* not __GNUC__ */
222 #if HAVE_ALLOCA_H
223 #include <alloca.h>
224 #else /* not __GNUC__ or HAVE_ALLOCA_H */
225 #ifndef _AIX /* Already did AIX, up at the top. */
226 char *alloca ();
227 #endif /* not _AIX */
228 #endif /* not HAVE_ALLOCA_H */
229 #endif /* not __GNUC__ */
230
231 #endif /* not alloca */
232
233 #define REGEX_ALLOCATE alloca
234
235 /* Assumes a `char *destination' variable. */
236 #define REGEX_REALLOCATE(source, osize, nsize) \
237 (destination = (char *) alloca (nsize), \
238 bcopy (source, destination, osize), \
239 destination)
240
241 #endif /* not REGEX_MALLOC */
242
243
244 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
245 `string1' or just past its end. This works if PTR is NULL, which is
246 a good thing. */
247 #define FIRST_STRING_P(ptr) \
248 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
249
250 /* (Re)Allocate N items of type T using malloc, or fail. */
251 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
252 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
253 #define RETALLOC_IF(addr, n, t) \
254 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
255 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
256
257 #define BYTEWIDTH 8 /* In bits. */
258
259 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
260
261 #undef MAX
262 #undef MIN
263 #define MAX(a, b) ((a) > (b) ? (a) : (b))
264 #define MIN(a, b) ((a) < (b) ? (a) : (b))
265
266 typedef char boolean;
267 #define false 0
268 #define true 1
269
270 static int re_match_2_internal ();
271 \f
272 /* These are the command codes that appear in compiled regular
273 expressions. Some opcodes are followed by argument bytes. A
274 command code can specify any interpretation whatsoever for its
275 arguments. Zero bytes may appear in the compiled regular expression. */
276
277 typedef enum
278 {
279 no_op = 0,
280
281 /* Succeed right away--no more backtracking. */
282 succeed,
283
284 /* Followed by one byte giving n, then by n literal bytes. */
285 exactn,
286
287 /* Matches any (more or less) character. */
288 anychar,
289
290 /* Matches any one char belonging to specified set. First
291 following byte is number of bitmap bytes. Then come bytes
292 for a bitmap saying which chars are in. Bits in each byte
293 are ordered low-bit-first. A character is in the set if its
294 bit is 1. A character too large to have a bit in the map is
295 automatically not in the set. */
296 charset,
297
298 /* Same parameters as charset, but match any character that is
299 not one of those specified. */
300 charset_not,
301
302 /* Start remembering the text that is matched, for storing in a
303 register. Followed by one byte with the register number, in
304 the range 0 to one less than the pattern buffer's re_nsub
305 field. Then followed by one byte with the number of groups
306 inner to this one. (This last has to be part of the
307 start_memory only because we need it in the on_failure_jump
308 of re_match_2.) */
309 start_memory,
310
311 /* Stop remembering the text that is matched and store it in a
312 memory register. Followed by one byte with the register
313 number, in the range 0 to one less than `re_nsub' in the
314 pattern buffer, and one byte with the number of inner groups,
315 just like `start_memory'. (We need the number of inner
316 groups here because we don't have any easy way of finding the
317 corresponding start_memory when we're at a stop_memory.) */
318 stop_memory,
319
320 /* Match a duplicate of something remembered. Followed by one
321 byte containing the register number. */
322 duplicate,
323
324 /* Fail unless at beginning of line. */
325 begline,
326
327 /* Fail unless at end of line. */
328 endline,
329
330 /* Succeeds if at beginning of buffer (if emacs) or at beginning
331 of string to be matched (if not). */
332 begbuf,
333
334 /* Analogously, for end of buffer/string. */
335 endbuf,
336
337 /* Followed by two byte relative address to which to jump. */
338 jump,
339
340 /* Same as jump, but marks the end of an alternative. */
341 jump_past_alt,
342
343 /* Followed by two-byte relative address of place to resume at
344 in case of failure. */
345 on_failure_jump,
346
347 /* Like on_failure_jump, but pushes a placeholder instead of the
348 current string position when executed. */
349 on_failure_keep_string_jump,
350
351 /* Throw away latest failure point and then jump to following
352 two-byte relative address. */
353 pop_failure_jump,
354
355 /* Change to pop_failure_jump if know won't have to backtrack to
356 match; otherwise change to jump. This is used to jump
357 back to the beginning of a repeat. If what follows this jump
358 clearly won't match what the repeat does, such that we can be
359 sure that there is no use backtracking out of repetitions
360 already matched, then we change it to a pop_failure_jump.
361 Followed by two-byte address. */
362 maybe_pop_jump,
363
364 /* Jump to following two-byte address, and push a dummy failure
365 point. This failure point will be thrown away if an attempt
366 is made to use it for a failure. A `+' construct makes this
367 before the first repeat. Also used as an intermediary kind
368 of jump when compiling an alternative. */
369 dummy_failure_jump,
370
371 /* Push a dummy failure point and continue. Used at the end of
372 alternatives. */
373 push_dummy_failure,
374
375 /* Followed by two-byte relative address and two-byte number n.
376 After matching N times, jump to the address upon failure. */
377 succeed_n,
378
379 /* Followed by two-byte relative address, and two-byte number n.
380 Jump to the address N times, then fail. */
381 jump_n,
382
383 /* Set the following two-byte relative address to the
384 subsequent two-byte number. The address *includes* the two
385 bytes of number. */
386 set_number_at,
387
388 wordchar, /* Matches any word-constituent character. */
389 notwordchar, /* Matches any char that is not a word-constituent. */
390
391 wordbeg, /* Succeeds if at word beginning. */
392 wordend, /* Succeeds if at word end. */
393
394 wordbound, /* Succeeds if at a word boundary. */
395 notwordbound /* Succeeds if not at a word boundary. */
396
397 #ifdef emacs
398 ,before_dot, /* Succeeds if before point. */
399 at_dot, /* Succeeds if at point. */
400 after_dot, /* Succeeds if after point. */
401
402 /* Matches any character whose syntax is specified. Followed by
403 a byte which contains a syntax code, e.g., Sword. */
404 syntaxspec,
405
406 /* Matches any character whose syntax is not that specified. */
407 notsyntaxspec
408 #endif /* emacs */
409 } re_opcode_t;
410 \f
411 /* Common operations on the compiled pattern. */
412
413 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
414
415 #define STORE_NUMBER(destination, number) \
416 do { \
417 (destination)[0] = (number) & 0377; \
418 (destination)[1] = (number) >> 8; \
419 } while (0)
420
421 /* Same as STORE_NUMBER, except increment DESTINATION to
422 the byte after where the number is stored. Therefore, DESTINATION
423 must be an lvalue. */
424
425 #define STORE_NUMBER_AND_INCR(destination, number) \
426 do { \
427 STORE_NUMBER (destination, number); \
428 (destination) += 2; \
429 } while (0)
430
431 /* Put into DESTINATION a number stored in two contiguous bytes starting
432 at SOURCE. */
433
434 #define EXTRACT_NUMBER(destination, source) \
435 do { \
436 (destination) = *(source) & 0377; \
437 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
438 } while (0)
439
440 #ifdef DEBUG
441 static void
442 extract_number (dest, source)
443 int *dest;
444 unsigned char *source;
445 {
446 int temp = SIGN_EXTEND_CHAR (*(source + 1));
447 *dest = *source & 0377;
448 *dest += temp << 8;
449 }
450
451 #ifndef EXTRACT_MACROS /* To debug the macros. */
452 #undef EXTRACT_NUMBER
453 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
454 #endif /* not EXTRACT_MACROS */
455
456 #endif /* DEBUG */
457
458 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
459 SOURCE must be an lvalue. */
460
461 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
462 do { \
463 EXTRACT_NUMBER (destination, source); \
464 (source) += 2; \
465 } while (0)
466
467 #ifdef DEBUG
468 static void
469 extract_number_and_incr (destination, source)
470 int *destination;
471 unsigned char **source;
472 {
473 extract_number (destination, *source);
474 *source += 2;
475 }
476
477 #ifndef EXTRACT_MACROS
478 #undef EXTRACT_NUMBER_AND_INCR
479 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
480 extract_number_and_incr (&dest, &src)
481 #endif /* not EXTRACT_MACROS */
482
483 #endif /* DEBUG */
484 \f
485 /* If DEBUG is defined, Regex prints many voluminous messages about what
486 it is doing (if the variable `debug' is nonzero). If linked with the
487 main program in `iregex.c', you can enter patterns and strings
488 interactively. And if linked with the main program in `main.c' and
489 the other test files, you can run the already-written tests. */
490
491 #ifdef DEBUG
492
493 /* We use standard I/O for debugging. */
494 #include <stdio.h>
495
496 /* It is useful to test things that ``must'' be true when debugging. */
497 #include <assert.h>
498
499 static int debug = 0;
500
501 #define DEBUG_STATEMENT(e) e
502 #define DEBUG_PRINT1(x) if (debug) printf (x)
503 #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
504 #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
505 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
506 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
507 if (debug) print_partial_compiled_pattern (s, e)
508 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
509 if (debug) print_double_string (w, s1, sz1, s2, sz2)
510
511
512 /* Print the fastmap in human-readable form. */
513
514 void
515 print_fastmap (fastmap)
516 char *fastmap;
517 {
518 unsigned was_a_range = 0;
519 unsigned i = 0;
520
521 while (i < (1 << BYTEWIDTH))
522 {
523 if (fastmap[i++])
524 {
525 was_a_range = 0;
526 putchar (i - 1);
527 while (i < (1 << BYTEWIDTH) && fastmap[i])
528 {
529 was_a_range = 1;
530 i++;
531 }
532 if (was_a_range)
533 {
534 printf ("-");
535 putchar (i - 1);
536 }
537 }
538 }
539 putchar ('\n');
540 }
541
542
543 /* Print a compiled pattern string in human-readable form, starting at
544 the START pointer into it and ending just before the pointer END. */
545
546 void
547 print_partial_compiled_pattern (start, end)
548 unsigned char *start;
549 unsigned char *end;
550 {
551 int mcnt, mcnt2;
552 unsigned char *p = start;
553 unsigned char *pend = end;
554
555 if (start == NULL)
556 {
557 printf ("(null)\n");
558 return;
559 }
560
561 /* Loop over pattern commands. */
562 while (p < pend)
563 {
564 printf ("%d:\t", p - start);
565
566 switch ((re_opcode_t) *p++)
567 {
568 case no_op:
569 printf ("/no_op");
570 break;
571
572 case exactn:
573 mcnt = *p++;
574 printf ("/exactn/%d", mcnt);
575 do
576 {
577 putchar ('/');
578 putchar (*p++);
579 }
580 while (--mcnt);
581 break;
582
583 case start_memory:
584 mcnt = *p++;
585 printf ("/start_memory/%d/%d", mcnt, *p++);
586 break;
587
588 case stop_memory:
589 mcnt = *p++;
590 printf ("/stop_memory/%d/%d", mcnt, *p++);
591 break;
592
593 case duplicate:
594 printf ("/duplicate/%d", *p++);
595 break;
596
597 case anychar:
598 printf ("/anychar");
599 break;
600
601 case charset:
602 case charset_not:
603 {
604 register int c, last = -100;
605 register int in_range = 0;
606
607 printf ("/charset [%s",
608 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
609
610 assert (p + *p < pend);
611
612 for (c = 0; c < 256; c++)
613 if (c / 8 < *p
614 && (p[1 + (c/8)] & (1 << (c % 8))))
615 {
616 /* Are we starting a range? */
617 if (last + 1 == c && ! in_range)
618 {
619 putchar ('-');
620 in_range = 1;
621 }
622 /* Have we broken a range? */
623 else if (last + 1 != c && in_range)
624 {
625 putchar (last);
626 in_range = 0;
627 }
628
629 if (! in_range)
630 putchar (c);
631
632 last = c;
633 }
634
635 if (in_range)
636 putchar (last);
637
638 putchar (']');
639
640 p += 1 + *p;
641 }
642 break;
643
644 case begline:
645 printf ("/begline");
646 break;
647
648 case endline:
649 printf ("/endline");
650 break;
651
652 case on_failure_jump:
653 extract_number_and_incr (&mcnt, &p);
654 printf ("/on_failure_jump to %d", p + mcnt - start);
655 break;
656
657 case on_failure_keep_string_jump:
658 extract_number_and_incr (&mcnt, &p);
659 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
660 break;
661
662 case dummy_failure_jump:
663 extract_number_and_incr (&mcnt, &p);
664 printf ("/dummy_failure_jump to %d", p + mcnt - start);
665 break;
666
667 case push_dummy_failure:
668 printf ("/push_dummy_failure");
669 break;
670
671 case maybe_pop_jump:
672 extract_number_and_incr (&mcnt, &p);
673 printf ("/maybe_pop_jump to %d", p + mcnt - start);
674 break;
675
676 case pop_failure_jump:
677 extract_number_and_incr (&mcnt, &p);
678 printf ("/pop_failure_jump to %d", p + mcnt - start);
679 break;
680
681 case jump_past_alt:
682 extract_number_and_incr (&mcnt, &p);
683 printf ("/jump_past_alt to %d", p + mcnt - start);
684 break;
685
686 case jump:
687 extract_number_and_incr (&mcnt, &p);
688 printf ("/jump to %d", p + mcnt - start);
689 break;
690
691 case succeed_n:
692 extract_number_and_incr (&mcnt, &p);
693 extract_number_and_incr (&mcnt2, &p);
694 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2);
695 break;
696
697 case jump_n:
698 extract_number_and_incr (&mcnt, &p);
699 extract_number_and_incr (&mcnt2, &p);
700 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2);
701 break;
702
703 case set_number_at:
704 extract_number_and_incr (&mcnt, &p);
705 extract_number_and_incr (&mcnt2, &p);
706 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2);
707 break;
708
709 case wordbound:
710 printf ("/wordbound");
711 break;
712
713 case notwordbound:
714 printf ("/notwordbound");
715 break;
716
717 case wordbeg:
718 printf ("/wordbeg");
719 break;
720
721 case wordend:
722 printf ("/wordend");
723
724 #ifdef emacs
725 case before_dot:
726 printf ("/before_dot");
727 break;
728
729 case at_dot:
730 printf ("/at_dot");
731 break;
732
733 case after_dot:
734 printf ("/after_dot");
735 break;
736
737 case syntaxspec:
738 printf ("/syntaxspec");
739 mcnt = *p++;
740 printf ("/%d", mcnt);
741 break;
742
743 case notsyntaxspec:
744 printf ("/notsyntaxspec");
745 mcnt = *p++;
746 printf ("/%d", mcnt);
747 break;
748 #endif /* emacs */
749
750 case wordchar:
751 printf ("/wordchar");
752 break;
753
754 case notwordchar:
755 printf ("/notwordchar");
756 break;
757
758 case begbuf:
759 printf ("/begbuf");
760 break;
761
762 case endbuf:
763 printf ("/endbuf");
764 break;
765
766 default:
767 printf ("?%d", *(p-1));
768 }
769
770 putchar ('\n');
771 }
772
773 printf ("%d:\tend of pattern.\n", p - start);
774 }
775
776
777 void
778 print_compiled_pattern (bufp)
779 struct re_pattern_buffer *bufp;
780 {
781 unsigned char *buffer = bufp->buffer;
782
783 print_partial_compiled_pattern (buffer, buffer + bufp->used);
784 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
785
786 if (bufp->fastmap_accurate && bufp->fastmap)
787 {
788 printf ("fastmap: ");
789 print_fastmap (bufp->fastmap);
790 }
791
792 printf ("re_nsub: %d\t", bufp->re_nsub);
793 printf ("regs_alloc: %d\t", bufp->regs_allocated);
794 printf ("can_be_null: %d\t", bufp->can_be_null);
795 printf ("newline_anchor: %d\n", bufp->newline_anchor);
796 printf ("no_sub: %d\t", bufp->no_sub);
797 printf ("not_bol: %d\t", bufp->not_bol);
798 printf ("not_eol: %d\t", bufp->not_eol);
799 printf ("syntax: %d\n", bufp->syntax);
800 /* Perhaps we should print the translate table? */
801 }
802
803
804 void
805 print_double_string (where, string1, size1, string2, size2)
806 const char *where;
807 const char *string1;
808 const char *string2;
809 int size1;
810 int size2;
811 {
812 unsigned this_char;
813
814 if (where == NULL)
815 printf ("(null)");
816 else
817 {
818 if (FIRST_STRING_P (where))
819 {
820 for (this_char = where - string1; this_char < size1; this_char++)
821 putchar (string1[this_char]);
822
823 where = string2;
824 }
825
826 for (this_char = where - string2; this_char < size2; this_char++)
827 putchar (string2[this_char]);
828 }
829 }
830
831 #else /* not DEBUG */
832
833 #undef assert
834 #define assert(e)
835
836 #define DEBUG_STATEMENT(e)
837 #define DEBUG_PRINT1(x)
838 #define DEBUG_PRINT2(x1, x2)
839 #define DEBUG_PRINT3(x1, x2, x3)
840 #define DEBUG_PRINT4(x1, x2, x3, x4)
841 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
842 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
843
844 #endif /* not DEBUG */
845 \f
846 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
847 also be assigned to arbitrarily: each pattern buffer stores its own
848 syntax, so it can be changed between regex compilations. */
849 /* This has no initializer because initialized variables in Emacs
850 become read-only after dumping. */
851 reg_syntax_t re_syntax_options;
852
853
854 /* Specify the precise syntax of regexps for compilation. This provides
855 for compatibility for various utilities which historically have
856 different, incompatible syntaxes.
857
858 The argument SYNTAX is a bit mask comprised of the various bits
859 defined in regex.h. We return the old syntax. */
860
861 reg_syntax_t
862 re_set_syntax (syntax)
863 reg_syntax_t syntax;
864 {
865 reg_syntax_t ret = re_syntax_options;
866
867 re_syntax_options = syntax;
868 return ret;
869 }
870 \f
871 /* This table gives an error message for each of the error codes listed
872 in regex.h. Obviously the order here has to be same as there.
873 POSIX doesn't require that we do anything for REG_NOERROR,
874 but why not be nice? */
875
876 static const char *re_error_msgid[] =
877 { "Success", /* REG_NOERROR */
878 "No match", /* REG_NOMATCH */
879 "Invalid regular expression", /* REG_BADPAT */
880 "Invalid collation character", /* REG_ECOLLATE */
881 "Invalid character class name", /* REG_ECTYPE */
882 "Trailing backslash", /* REG_EESCAPE */
883 "Invalid back reference", /* REG_ESUBREG */
884 "Unmatched [ or [^", /* REG_EBRACK */
885 "Unmatched ( or \\(", /* REG_EPAREN */
886 "Unmatched \\{", /* REG_EBRACE */
887 "Invalid content of \\{\\}", /* REG_BADBR */
888 "Invalid range end", /* REG_ERANGE */
889 "Memory exhausted", /* REG_ESPACE */
890 "Invalid preceding regular expression", /* REG_BADRPT */
891 "Premature end of regular expression", /* REG_EEND */
892 "Regular expression too big", /* REG_ESIZE */
893 "Unmatched ) or \\)", /* REG_ERPAREN */
894 };
895 \f
896 /* Avoiding alloca during matching, to placate r_alloc. */
897
898 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
899 searching and matching functions should not call alloca. On some
900 systems, alloca is implemented in terms of malloc, and if we're
901 using the relocating allocator routines, then malloc could cause a
902 relocation, which might (if the strings being searched are in the
903 ralloc heap) shift the data out from underneath the regexp
904 routines.
905
906 Here's another reason to avoid allocation: Emacs
907 processes input from X in a signal handler; processing X input may
908 call malloc; if input arrives while a matching routine is calling
909 malloc, then we're scrod. But Emacs can't just block input while
910 calling matching routines; then we don't notice interrupts when
911 they come in. So, Emacs blocks input around all regexp calls
912 except the matching calls, which it leaves unprotected, in the
913 faith that they will not malloc. */
914
915 /* Normally, this is fine. */
916 #define MATCH_MAY_ALLOCATE
917
918 /* The match routines may not allocate if (1) they would do it with malloc
919 and (2) it's not safe for them to use malloc. */
920 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && (defined (emacs) || defined (REL_ALLOC))
921 #undef MATCH_MAY_ALLOCATE
922 #endif
923
924 \f
925 /* Failure stack declarations and macros; both re_compile_fastmap and
926 re_match_2 use a failure stack. These have to be macros because of
927 REGEX_ALLOCATE. */
928
929
930 /* Number of failure points for which to initially allocate space
931 when matching. If this number is exceeded, we allocate more
932 space, so it is not a hard limit. */
933 #ifndef INIT_FAILURE_ALLOC
934 #define INIT_FAILURE_ALLOC 5
935 #endif
936
937 /* Roughly the maximum number of failure points on the stack. Would be
938 exactly that if always used MAX_FAILURE_SPACE each time we failed.
939 This is a variable only so users of regex can assign to it; we never
940 change it ourselves. */
941 int re_max_failures = 2000;
942
943 typedef unsigned char *fail_stack_elt_t;
944
945 typedef struct
946 {
947 fail_stack_elt_t *stack;
948 unsigned size;
949 unsigned avail; /* Offset of next open position. */
950 } fail_stack_type;
951
952 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
953 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
954 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
955 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
956
957
958 /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
959
960 #ifdef MATCH_MAY_ALLOCATE
961 #define INIT_FAIL_STACK() \
962 do { \
963 fail_stack.stack = (fail_stack_elt_t *) \
964 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
965 \
966 if (fail_stack.stack == NULL) \
967 return -2; \
968 \
969 fail_stack.size = INIT_FAILURE_ALLOC; \
970 fail_stack.avail = 0; \
971 } while (0)
972 #else
973 #define INIT_FAIL_STACK() \
974 do { \
975 fail_stack.avail = 0; \
976 } while (0)
977 #endif
978
979
980 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
981
982 Return 1 if succeeds, and 0 if either ran out of memory
983 allocating space for it or it was already too large.
984
985 REGEX_REALLOCATE requires `destination' be declared. */
986
987 #define DOUBLE_FAIL_STACK(fail_stack) \
988 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
989 ? 0 \
990 : ((fail_stack).stack = (fail_stack_elt_t *) \
991 REGEX_REALLOCATE ((fail_stack).stack, \
992 (fail_stack).size * sizeof (fail_stack_elt_t), \
993 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
994 \
995 (fail_stack).stack == NULL \
996 ? 0 \
997 : ((fail_stack).size <<= 1, \
998 1)))
999
1000
1001 /* Push PATTERN_OP on FAIL_STACK.
1002
1003 Return 1 if was able to do so and 0 if ran out of memory allocating
1004 space to do so. */
1005 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
1006 ((FAIL_STACK_FULL () \
1007 && !DOUBLE_FAIL_STACK (fail_stack)) \
1008 ? 0 \
1009 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
1010 1))
1011
1012 /* This pushes an item onto the failure stack. Must be a four-byte
1013 value. Assumes the variable `fail_stack'. Probably should only
1014 be called from within `PUSH_FAILURE_POINT'. */
1015 #define PUSH_FAILURE_ITEM(item) \
1016 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
1017
1018 /* The complement operation. Assumes `fail_stack' is nonempty. */
1019 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
1020
1021 /* Used to omit pushing failure point id's when we're not debugging. */
1022 #ifdef DEBUG
1023 #define DEBUG_PUSH PUSH_FAILURE_ITEM
1024 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
1025 #else
1026 #define DEBUG_PUSH(item)
1027 #define DEBUG_POP(item_addr)
1028 #endif
1029
1030
1031 /* Push the information about the state we will need
1032 if we ever fail back to it.
1033
1034 Requires variables fail_stack, regstart, regend, reg_info, and
1035 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1036 declared.
1037
1038 Does `return FAILURE_CODE' if runs out of memory. */
1039
1040 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1041 do { \
1042 char *destination; \
1043 /* Must be int, so when we don't save any registers, the arithmetic \
1044 of 0 + -1 isn't done as unsigned. */ \
1045 int this_reg; \
1046 \
1047 DEBUG_STATEMENT (failure_id++); \
1048 DEBUG_STATEMENT (nfailure_points_pushed++); \
1049 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1050 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
1051 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1052 \
1053 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1054 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
1055 \
1056 /* Ensure we have enough space allocated for what we will push. */ \
1057 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
1058 { \
1059 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1060 return failure_code; \
1061 \
1062 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
1063 (fail_stack).size); \
1064 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1065 } \
1066 \
1067 /* Push the info, starting with the registers. */ \
1068 DEBUG_PRINT1 ("\n"); \
1069 \
1070 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
1071 this_reg++) \
1072 { \
1073 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1074 DEBUG_STATEMENT (num_regs_pushed++); \
1075 \
1076 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1077 PUSH_FAILURE_ITEM (regstart[this_reg]); \
1078 \
1079 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1080 PUSH_FAILURE_ITEM (regend[this_reg]); \
1081 \
1082 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
1083 DEBUG_PRINT2 (" match_null=%d", \
1084 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
1085 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
1086 DEBUG_PRINT2 (" matched_something=%d", \
1087 MATCHED_SOMETHING (reg_info[this_reg])); \
1088 DEBUG_PRINT2 (" ever_matched=%d", \
1089 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
1090 DEBUG_PRINT1 ("\n"); \
1091 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
1092 } \
1093 \
1094 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
1095 PUSH_FAILURE_ITEM (lowest_active_reg); \
1096 \
1097 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
1098 PUSH_FAILURE_ITEM (highest_active_reg); \
1099 \
1100 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
1101 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
1102 PUSH_FAILURE_ITEM (pattern_place); \
1103 \
1104 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
1105 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
1106 size2); \
1107 DEBUG_PRINT1 ("'\n"); \
1108 PUSH_FAILURE_ITEM (string_place); \
1109 \
1110 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
1111 DEBUG_PUSH (failure_id); \
1112 } while (0)
1113
1114 /* This is the number of items that are pushed and popped on the stack
1115 for each register. */
1116 #define NUM_REG_ITEMS 3
1117
1118 /* Individual items aside from the registers. */
1119 #ifdef DEBUG
1120 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1121 #else
1122 #define NUM_NONREG_ITEMS 4
1123 #endif
1124
1125 /* We push at most this many items on the stack. */
1126 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
1127
1128 /* We actually push this many items. */
1129 #define NUM_FAILURE_ITEMS \
1130 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1131 + NUM_NONREG_ITEMS)
1132
1133 /* How many items can still be added to the stack without overflowing it. */
1134 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1135
1136
1137 /* Pops what PUSH_FAIL_STACK pushes.
1138
1139 We restore into the parameters, all of which should be lvalues:
1140 STR -- the saved data position.
1141 PAT -- the saved pattern position.
1142 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1143 REGSTART, REGEND -- arrays of string positions.
1144 REG_INFO -- array of information about each subexpression.
1145
1146 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1147 `pend', `string1', `size1', `string2', and `size2'. */
1148
1149 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
1150 { \
1151 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
1152 int this_reg; \
1153 const unsigned char *string_temp; \
1154 \
1155 assert (!FAIL_STACK_EMPTY ()); \
1156 \
1157 /* Remove failure points and point to how many regs pushed. */ \
1158 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1159 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1160 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1161 \
1162 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1163 \
1164 DEBUG_POP (&failure_id); \
1165 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
1166 \
1167 /* If the saved string location is NULL, it came from an \
1168 on_failure_keep_string_jump opcode, and we want to throw away the \
1169 saved NULL, thus retaining our current position in the string. */ \
1170 string_temp = POP_FAILURE_ITEM (); \
1171 if (string_temp != NULL) \
1172 str = (const char *) string_temp; \
1173 \
1174 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
1175 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1176 DEBUG_PRINT1 ("'\n"); \
1177 \
1178 pat = (unsigned char *) POP_FAILURE_ITEM (); \
1179 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
1180 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1181 \
1182 /* Restore register info. */ \
1183 high_reg = (unsigned) POP_FAILURE_ITEM (); \
1184 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1185 \
1186 low_reg = (unsigned) POP_FAILURE_ITEM (); \
1187 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1188 \
1189 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
1190 { \
1191 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1192 \
1193 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
1194 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
1195 \
1196 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
1197 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
1198 \
1199 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
1200 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
1201 } \
1202 \
1203 set_regs_matched_done = 0; \
1204 DEBUG_STATEMENT (nfailure_points_popped++); \
1205 } /* POP_FAILURE_POINT */
1206
1207
1208 \f
1209 /* Structure for per-register (a.k.a. per-group) information.
1210 This must not be longer than one word, because we push this value
1211 onto the failure stack. Other register information, such as the
1212 starting and ending positions (which are addresses), and the list of
1213 inner groups (which is a bits list) are maintained in separate
1214 variables.
1215
1216 We are making a (strictly speaking) nonportable assumption here: that
1217 the compiler will pack our bit fields into something that fits into
1218 the type of `word', i.e., is something that fits into one item on the
1219 failure stack. */
1220 typedef union
1221 {
1222 fail_stack_elt_t word;
1223 struct
1224 {
1225 /* This field is one if this group can match the empty string,
1226 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1227 #define MATCH_NULL_UNSET_VALUE 3
1228 unsigned match_null_string_p : 2;
1229 unsigned is_active : 1;
1230 unsigned matched_something : 1;
1231 unsigned ever_matched_something : 1;
1232 } bits;
1233 } register_info_type;
1234
1235 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1236 #define IS_ACTIVE(R) ((R).bits.is_active)
1237 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1238 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1239
1240
1241 /* Call this when have matched a real character; it sets `matched' flags
1242 for the subexpressions which we are currently inside. Also records
1243 that those subexprs have matched. */
1244 #define SET_REGS_MATCHED() \
1245 do \
1246 { \
1247 if (!set_regs_matched_done) \
1248 { \
1249 unsigned r; \
1250 set_regs_matched_done = 1; \
1251 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1252 { \
1253 MATCHED_SOMETHING (reg_info[r]) \
1254 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1255 = 1; \
1256 } \
1257 } \
1258 } \
1259 while (0)
1260
1261 /* Registers are set to a sentinel when they haven't yet matched. */
1262 static char reg_unset_dummy;
1263 #define REG_UNSET_VALUE (&reg_unset_dummy)
1264 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1265
1266
1267 \f
1268 /* How do we implement a missing MATCH_MAY_ALLOCATE?
1269 We make the fail stack a global thing, and then grow it to
1270 re_max_failures when we compile. */
1271 #ifndef MATCH_MAY_ALLOCATE
1272 static fail_stack_type fail_stack;
1273
1274 static const char ** regstart, ** regend;
1275 static const char ** old_regstart, ** old_regend;
1276 static const char **best_regstart, **best_regend;
1277 static register_info_type *reg_info;
1278 static const char **reg_dummy;
1279 static register_info_type *reg_info_dummy;
1280 #endif
1281
1282 \f
1283 /* Subroutine declarations and macros for regex_compile. */
1284
1285 static void store_op1 (), store_op2 ();
1286 static void insert_op1 (), insert_op2 ();
1287 static boolean at_begline_loc_p (), at_endline_loc_p ();
1288 static boolean group_in_compile_stack ();
1289 static reg_errcode_t compile_range ();
1290
1291 /* Fetch the next character in the uncompiled pattern---translating it
1292 if necessary. Also cast from a signed character in the constant
1293 string passed to us by the user to an unsigned char that we can use
1294 as an array index (in, e.g., `translate'). */
1295 #define PATFETCH(c) \
1296 do {if (p == pend) return REG_EEND; \
1297 c = (unsigned char) *p++; \
1298 if (translate) c = translate[c]; \
1299 } while (0)
1300
1301 /* Fetch the next character in the uncompiled pattern, with no
1302 translation. */
1303 #define PATFETCH_RAW(c) \
1304 do {if (p == pend) return REG_EEND; \
1305 c = (unsigned char) *p++; \
1306 } while (0)
1307
1308 /* Go backwards one character in the pattern. */
1309 #define PATUNFETCH p--
1310
1311
1312 /* If `translate' is non-null, return translate[D], else just D. We
1313 cast the subscript to translate because some data is declared as
1314 `char *', to avoid warnings when a string constant is passed. But
1315 when we use a character as a subscript we must make it unsigned. */
1316 #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
1317
1318
1319 /* Macros for outputting the compiled pattern into `buffer'. */
1320
1321 /* If the buffer isn't allocated when it comes in, use this. */
1322 #define INIT_BUF_SIZE 32
1323
1324 /* Make sure we have at least N more bytes of space in buffer. */
1325 #define GET_BUFFER_SPACE(n) \
1326 while (b - bufp->buffer + (n) > bufp->allocated) \
1327 EXTEND_BUFFER ()
1328
1329 /* Make sure we have one more byte of buffer space and then add C to it. */
1330 #define BUF_PUSH(c) \
1331 do { \
1332 GET_BUFFER_SPACE (1); \
1333 *b++ = (unsigned char) (c); \
1334 } while (0)
1335
1336
1337 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1338 #define BUF_PUSH_2(c1, c2) \
1339 do { \
1340 GET_BUFFER_SPACE (2); \
1341 *b++ = (unsigned char) (c1); \
1342 *b++ = (unsigned char) (c2); \
1343 } while (0)
1344
1345
1346 /* As with BUF_PUSH_2, except for three bytes. */
1347 #define BUF_PUSH_3(c1, c2, c3) \
1348 do { \
1349 GET_BUFFER_SPACE (3); \
1350 *b++ = (unsigned char) (c1); \
1351 *b++ = (unsigned char) (c2); \
1352 *b++ = (unsigned char) (c3); \
1353 } while (0)
1354
1355
1356 /* Store a jump with opcode OP at LOC to location TO. We store a
1357 relative address offset by the three bytes the jump itself occupies. */
1358 #define STORE_JUMP(op, loc, to) \
1359 store_op1 (op, loc, (to) - (loc) - 3)
1360
1361 /* Likewise, for a two-argument jump. */
1362 #define STORE_JUMP2(op, loc, to, arg) \
1363 store_op2 (op, loc, (to) - (loc) - 3, arg)
1364
1365 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1366 #define INSERT_JUMP(op, loc, to) \
1367 insert_op1 (op, loc, (to) - (loc) - 3, b)
1368
1369 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1370 #define INSERT_JUMP2(op, loc, to, arg) \
1371 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1372
1373
1374 /* This is not an arbitrary limit: the arguments which represent offsets
1375 into the pattern are two bytes long. So if 2^16 bytes turns out to
1376 be too small, many things would have to change. */
1377 #define MAX_BUF_SIZE (1L << 16)
1378
1379
1380 /* Extend the buffer by twice its current size via realloc and
1381 reset the pointers that pointed into the old block to point to the
1382 correct places in the new one. If extending the buffer results in it
1383 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1384 #define EXTEND_BUFFER() \
1385 do { \
1386 unsigned char *old_buffer = bufp->buffer; \
1387 if (bufp->allocated == MAX_BUF_SIZE) \
1388 return REG_ESIZE; \
1389 bufp->allocated <<= 1; \
1390 if (bufp->allocated > MAX_BUF_SIZE) \
1391 bufp->allocated = MAX_BUF_SIZE; \
1392 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1393 if (bufp->buffer == NULL) \
1394 return REG_ESPACE; \
1395 /* If the buffer moved, move all the pointers into it. */ \
1396 if (old_buffer != bufp->buffer) \
1397 { \
1398 b = (b - old_buffer) + bufp->buffer; \
1399 begalt = (begalt - old_buffer) + bufp->buffer; \
1400 if (fixup_alt_jump) \
1401 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1402 if (laststart) \
1403 laststart = (laststart - old_buffer) + bufp->buffer; \
1404 if (pending_exact) \
1405 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1406 } \
1407 } while (0)
1408
1409
1410 /* Since we have one byte reserved for the register number argument to
1411 {start,stop}_memory, the maximum number of groups we can report
1412 things about is what fits in that byte. */
1413 #define MAX_REGNUM 255
1414
1415 /* But patterns can have more than `MAX_REGNUM' registers. We just
1416 ignore the excess. */
1417 typedef unsigned regnum_t;
1418
1419
1420 /* Macros for the compile stack. */
1421
1422 /* Since offsets can go either forwards or backwards, this type needs to
1423 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1424 typedef int pattern_offset_t;
1425
1426 typedef struct
1427 {
1428 pattern_offset_t begalt_offset;
1429 pattern_offset_t fixup_alt_jump;
1430 pattern_offset_t inner_group_offset;
1431 pattern_offset_t laststart_offset;
1432 regnum_t regnum;
1433 } compile_stack_elt_t;
1434
1435
1436 typedef struct
1437 {
1438 compile_stack_elt_t *stack;
1439 unsigned size;
1440 unsigned avail; /* Offset of next open position. */
1441 } compile_stack_type;
1442
1443
1444 #define INIT_COMPILE_STACK_SIZE 32
1445
1446 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1447 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1448
1449 /* The next available element. */
1450 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1451
1452
1453 /* Set the bit for character C in a list. */
1454 #define SET_LIST_BIT(c) \
1455 (b[((unsigned char) (c)) / BYTEWIDTH] \
1456 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1457
1458
1459 /* Get the next unsigned number in the uncompiled pattern. */
1460 #define GET_UNSIGNED_NUMBER(num) \
1461 { if (p != pend) \
1462 { \
1463 PATFETCH (c); \
1464 while (ISDIGIT (c)) \
1465 { \
1466 if (num < 0) \
1467 num = 0; \
1468 num = num * 10 + c - '0'; \
1469 if (p == pend) \
1470 break; \
1471 PATFETCH (c); \
1472 } \
1473 } \
1474 }
1475
1476 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1477
1478 #define IS_CHAR_CLASS(string) \
1479 (STREQ (string, "alpha") || STREQ (string, "upper") \
1480 || STREQ (string, "lower") || STREQ (string, "digit") \
1481 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1482 || STREQ (string, "space") || STREQ (string, "print") \
1483 || STREQ (string, "punct") || STREQ (string, "graph") \
1484 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1485 \f
1486 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1487 Returns one of error codes defined in `regex.h', or zero for success.
1488
1489 Assumes the `allocated' (and perhaps `buffer') and `translate'
1490 fields are set in BUFP on entry.
1491
1492 If it succeeds, results are put in BUFP (if it returns an error, the
1493 contents of BUFP are undefined):
1494 `buffer' is the compiled pattern;
1495 `syntax' is set to SYNTAX;
1496 `used' is set to the length of the compiled pattern;
1497 `fastmap_accurate' is zero;
1498 `re_nsub' is the number of subexpressions in PATTERN;
1499 `not_bol' and `not_eol' are zero;
1500
1501 The `fastmap' and `newline_anchor' fields are neither
1502 examined nor set. */
1503
1504 /* Return, freeing storage we allocated. */
1505 #define FREE_STACK_RETURN(value) \
1506 return (free (compile_stack.stack), value)
1507
1508 static reg_errcode_t
1509 regex_compile (pattern, size, syntax, bufp)
1510 const char *pattern;
1511 int size;
1512 reg_syntax_t syntax;
1513 struct re_pattern_buffer *bufp;
1514 {
1515 /* We fetch characters from PATTERN here. Even though PATTERN is
1516 `char *' (i.e., signed), we declare these variables as unsigned, so
1517 they can be reliably used as array indices. */
1518 register unsigned char c, c1;
1519
1520 /* A random temporary spot in PATTERN. */
1521 const char *p1;
1522
1523 /* Points to the end of the buffer, where we should append. */
1524 register unsigned char *b;
1525
1526 /* Keeps track of unclosed groups. */
1527 compile_stack_type compile_stack;
1528
1529 /* Points to the current (ending) position in the pattern. */
1530 const char *p = pattern;
1531 const char *pend = pattern + size;
1532
1533 /* How to translate the characters in the pattern. */
1534 char *translate = bufp->translate;
1535
1536 /* Address of the count-byte of the most recently inserted `exactn'
1537 command. This makes it possible to tell if a new exact-match
1538 character can be added to that command or if the character requires
1539 a new `exactn' command. */
1540 unsigned char *pending_exact = 0;
1541
1542 /* Address of start of the most recently finished expression.
1543 This tells, e.g., postfix * where to find the start of its
1544 operand. Reset at the beginning of groups and alternatives. */
1545 unsigned char *laststart = 0;
1546
1547 /* Address of beginning of regexp, or inside of last group. */
1548 unsigned char *begalt;
1549
1550 /* Place in the uncompiled pattern (i.e., the {) to
1551 which to go back if the interval is invalid. */
1552 const char *beg_interval;
1553
1554 /* Address of the place where a forward jump should go to the end of
1555 the containing expression. Each alternative of an `or' -- except the
1556 last -- ends with a forward jump of this sort. */
1557 unsigned char *fixup_alt_jump = 0;
1558
1559 /* Counts open-groups as they are encountered. Remembered for the
1560 matching close-group on the compile stack, so the same register
1561 number is put in the stop_memory as the start_memory. */
1562 regnum_t regnum = 0;
1563
1564 #ifdef DEBUG
1565 DEBUG_PRINT1 ("\nCompiling pattern: ");
1566 if (debug)
1567 {
1568 unsigned debug_count;
1569
1570 for (debug_count = 0; debug_count < size; debug_count++)
1571 putchar (pattern[debug_count]);
1572 putchar ('\n');
1573 }
1574 #endif /* DEBUG */
1575
1576 /* Initialize the compile stack. */
1577 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1578 if (compile_stack.stack == NULL)
1579 return REG_ESPACE;
1580
1581 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1582 compile_stack.avail = 0;
1583
1584 /* Initialize the pattern buffer. */
1585 bufp->syntax = syntax;
1586 bufp->fastmap_accurate = 0;
1587 bufp->not_bol = bufp->not_eol = 0;
1588
1589 /* Set `used' to zero, so that if we return an error, the pattern
1590 printer (for debugging) will think there's no pattern. We reset it
1591 at the end. */
1592 bufp->used = 0;
1593
1594 /* Always count groups, whether or not bufp->no_sub is set. */
1595 bufp->re_nsub = 0;
1596
1597 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1598 /* Initialize the syntax table. */
1599 init_syntax_once ();
1600 #endif
1601
1602 if (bufp->allocated == 0)
1603 {
1604 if (bufp->buffer)
1605 { /* If zero allocated, but buffer is non-null, try to realloc
1606 enough space. This loses if buffer's address is bogus, but
1607 that is the user's responsibility. */
1608 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1609 }
1610 else
1611 { /* Caller did not allocate a buffer. Do it for them. */
1612 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1613 }
1614 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
1615
1616 bufp->allocated = INIT_BUF_SIZE;
1617 }
1618
1619 begalt = b = bufp->buffer;
1620
1621 /* Loop through the uncompiled pattern until we're at the end. */
1622 while (p != pend)
1623 {
1624 PATFETCH (c);
1625
1626 switch (c)
1627 {
1628 case '^':
1629 {
1630 if ( /* If at start of pattern, it's an operator. */
1631 p == pattern + 1
1632 /* If context independent, it's an operator. */
1633 || syntax & RE_CONTEXT_INDEP_ANCHORS
1634 /* Otherwise, depends on what's come before. */
1635 || at_begline_loc_p (pattern, p, syntax))
1636 BUF_PUSH (begline);
1637 else
1638 goto normal_char;
1639 }
1640 break;
1641
1642
1643 case '$':
1644 {
1645 if ( /* If at end of pattern, it's an operator. */
1646 p == pend
1647 /* If context independent, it's an operator. */
1648 || syntax & RE_CONTEXT_INDEP_ANCHORS
1649 /* Otherwise, depends on what's next. */
1650 || at_endline_loc_p (p, pend, syntax))
1651 BUF_PUSH (endline);
1652 else
1653 goto normal_char;
1654 }
1655 break;
1656
1657
1658 case '+':
1659 case '?':
1660 if ((syntax & RE_BK_PLUS_QM)
1661 || (syntax & RE_LIMITED_OPS))
1662 goto normal_char;
1663 handle_plus:
1664 case '*':
1665 /* If there is no previous pattern... */
1666 if (!laststart)
1667 {
1668 if (syntax & RE_CONTEXT_INVALID_OPS)
1669 FREE_STACK_RETURN (REG_BADRPT);
1670 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1671 goto normal_char;
1672 }
1673
1674 {
1675 /* Are we optimizing this jump? */
1676 boolean keep_string_p = false;
1677
1678 /* 1 means zero (many) matches is allowed. */
1679 char zero_times_ok = 0, many_times_ok = 0;
1680
1681 /* If there is a sequence of repetition chars, collapse it
1682 down to just one (the right one). We can't combine
1683 interval operators with these because of, e.g., `a{2}*',
1684 which should only match an even number of `a's. */
1685
1686 for (;;)
1687 {
1688 zero_times_ok |= c != '+';
1689 many_times_ok |= c != '?';
1690
1691 if (p == pend)
1692 break;
1693
1694 PATFETCH (c);
1695
1696 if (c == '*'
1697 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1698 ;
1699
1700 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1701 {
1702 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
1703
1704 PATFETCH (c1);
1705 if (!(c1 == '+' || c1 == '?'))
1706 {
1707 PATUNFETCH;
1708 PATUNFETCH;
1709 break;
1710 }
1711
1712 c = c1;
1713 }
1714 else
1715 {
1716 PATUNFETCH;
1717 break;
1718 }
1719
1720 /* If we get here, we found another repeat character. */
1721 }
1722
1723 /* Star, etc. applied to an empty pattern is equivalent
1724 to an empty pattern. */
1725 if (!laststart)
1726 break;
1727
1728 /* Now we know whether or not zero matches is allowed
1729 and also whether or not two or more matches is allowed. */
1730 if (many_times_ok)
1731 { /* More than one repetition is allowed, so put in at the
1732 end a backward relative jump from `b' to before the next
1733 jump we're going to put in below (which jumps from
1734 laststart to after this jump).
1735
1736 But if we are at the `*' in the exact sequence `.*\n',
1737 insert an unconditional jump backwards to the .,
1738 instead of the beginning of the loop. This way we only
1739 push a failure point once, instead of every time
1740 through the loop. */
1741 assert (p - 1 > pattern);
1742
1743 /* Allocate the space for the jump. */
1744 GET_BUFFER_SPACE (3);
1745
1746 /* We know we are not at the first character of the pattern,
1747 because laststart was nonzero. And we've already
1748 incremented `p', by the way, to be the character after
1749 the `*'. Do we have to do something analogous here
1750 for null bytes, because of RE_DOT_NOT_NULL? */
1751 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1752 && zero_times_ok
1753 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1754 && !(syntax & RE_DOT_NEWLINE))
1755 { /* We have .*\n. */
1756 STORE_JUMP (jump, b, laststart);
1757 keep_string_p = true;
1758 }
1759 else
1760 /* Anything else. */
1761 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1762
1763 /* We've added more stuff to the buffer. */
1764 b += 3;
1765 }
1766
1767 /* On failure, jump from laststart to b + 3, which will be the
1768 end of the buffer after this jump is inserted. */
1769 GET_BUFFER_SPACE (3);
1770 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1771 : on_failure_jump,
1772 laststart, b + 3);
1773 pending_exact = 0;
1774 b += 3;
1775
1776 if (!zero_times_ok)
1777 {
1778 /* At least one repetition is required, so insert a
1779 `dummy_failure_jump' before the initial
1780 `on_failure_jump' instruction of the loop. This
1781 effects a skip over that instruction the first time
1782 we hit that loop. */
1783 GET_BUFFER_SPACE (3);
1784 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1785 b += 3;
1786 }
1787 }
1788 break;
1789
1790
1791 case '.':
1792 laststart = b;
1793 BUF_PUSH (anychar);
1794 break;
1795
1796
1797 case '[':
1798 {
1799 boolean had_char_class = false;
1800
1801 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1802
1803 /* Ensure that we have enough space to push a charset: the
1804 opcode, the length count, and the bitset; 34 bytes in all. */
1805 GET_BUFFER_SPACE (34);
1806
1807 laststart = b;
1808
1809 /* We test `*p == '^' twice, instead of using an if
1810 statement, so we only need one BUF_PUSH. */
1811 BUF_PUSH (*p == '^' ? charset_not : charset);
1812 if (*p == '^')
1813 p++;
1814
1815 /* Remember the first position in the bracket expression. */
1816 p1 = p;
1817
1818 /* Push the number of bytes in the bitmap. */
1819 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1820
1821 /* Clear the whole map. */
1822 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1823
1824 /* charset_not matches newline according to a syntax bit. */
1825 if ((re_opcode_t) b[-2] == charset_not
1826 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1827 SET_LIST_BIT ('\n');
1828
1829 /* Read in characters and ranges, setting map bits. */
1830 for (;;)
1831 {
1832 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1833
1834 PATFETCH (c);
1835
1836 /* \ might escape characters inside [...] and [^...]. */
1837 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1838 {
1839 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
1840
1841 PATFETCH (c1);
1842 SET_LIST_BIT (c1);
1843 continue;
1844 }
1845
1846 /* Could be the end of the bracket expression. If it's
1847 not (i.e., when the bracket expression is `[]' so
1848 far), the ']' character bit gets set way below. */
1849 if (c == ']' && p != p1 + 1)
1850 break;
1851
1852 /* Look ahead to see if it's a range when the last thing
1853 was a character class. */
1854 if (had_char_class && c == '-' && *p != ']')
1855 FREE_STACK_RETURN (REG_ERANGE);
1856
1857 /* Look ahead to see if it's a range when the last thing
1858 was a character: if this is a hyphen not at the
1859 beginning or the end of a list, then it's the range
1860 operator. */
1861 if (c == '-'
1862 && !(p - 2 >= pattern && p[-2] == '[')
1863 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1864 && *p != ']')
1865 {
1866 reg_errcode_t ret
1867 = compile_range (&p, pend, translate, syntax, b);
1868 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
1869 }
1870
1871 else if (p[0] == '-' && p[1] != ']')
1872 { /* This handles ranges made up of characters only. */
1873 reg_errcode_t ret;
1874
1875 /* Move past the `-'. */
1876 PATFETCH (c1);
1877
1878 ret = compile_range (&p, pend, translate, syntax, b);
1879 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret);
1880 }
1881
1882 /* See if we're at the beginning of a possible character
1883 class. */
1884
1885 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1886 { /* Leave room for the null. */
1887 char str[CHAR_CLASS_MAX_LENGTH + 1];
1888
1889 PATFETCH (c);
1890 c1 = 0;
1891
1892 /* If pattern is `[[:'. */
1893 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1894
1895 for (;;)
1896 {
1897 PATFETCH (c);
1898 if (c == ':' || c == ']' || p == pend
1899 || c1 == CHAR_CLASS_MAX_LENGTH)
1900 break;
1901 str[c1++] = c;
1902 }
1903 str[c1] = '\0';
1904
1905 /* If isn't a word bracketed by `[:' and:`]':
1906 undo the ending character, the letters, and leave
1907 the leading `:' and `[' (but set bits for them). */
1908 if (c == ':' && *p == ']')
1909 {
1910 int ch;
1911 boolean is_alnum = STREQ (str, "alnum");
1912 boolean is_alpha = STREQ (str, "alpha");
1913 boolean is_blank = STREQ (str, "blank");
1914 boolean is_cntrl = STREQ (str, "cntrl");
1915 boolean is_digit = STREQ (str, "digit");
1916 boolean is_graph = STREQ (str, "graph");
1917 boolean is_lower = STREQ (str, "lower");
1918 boolean is_print = STREQ (str, "print");
1919 boolean is_punct = STREQ (str, "punct");
1920 boolean is_space = STREQ (str, "space");
1921 boolean is_upper = STREQ (str, "upper");
1922 boolean is_xdigit = STREQ (str, "xdigit");
1923
1924 if (!IS_CHAR_CLASS (str))
1925 FREE_STACK_RETURN (REG_ECTYPE);
1926
1927 /* Throw away the ] at the end of the character
1928 class. */
1929 PATFETCH (c);
1930
1931 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
1932
1933 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1934 {
1935 /* This was split into 3 if's to
1936 avoid an arbitrary limit in some compiler. */
1937 if ( (is_alnum && ISALNUM (ch))
1938 || (is_alpha && ISALPHA (ch))
1939 || (is_blank && ISBLANK (ch))
1940 || (is_cntrl && ISCNTRL (ch)))
1941 SET_LIST_BIT (ch);
1942 if ( (is_digit && ISDIGIT (ch))
1943 || (is_graph && ISGRAPH (ch))
1944 || (is_lower && ISLOWER (ch))
1945 || (is_print && ISPRINT (ch)))
1946 SET_LIST_BIT (ch);
1947 if ( (is_punct && ISPUNCT (ch))
1948 || (is_space && ISSPACE (ch))
1949 || (is_upper && ISUPPER (ch))
1950 || (is_xdigit && ISXDIGIT (ch)))
1951 SET_LIST_BIT (ch);
1952 }
1953 had_char_class = true;
1954 }
1955 else
1956 {
1957 c1++;
1958 while (c1--)
1959 PATUNFETCH;
1960 SET_LIST_BIT ('[');
1961 SET_LIST_BIT (':');
1962 had_char_class = false;
1963 }
1964 }
1965 else
1966 {
1967 had_char_class = false;
1968 SET_LIST_BIT (c);
1969 }
1970 }
1971
1972 /* Discard any (non)matching list bytes that are all 0 at the
1973 end of the map. Decrease the map-length byte too. */
1974 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1975 b[-1]--;
1976 b += b[-1];
1977 }
1978 break;
1979
1980
1981 case '(':
1982 if (syntax & RE_NO_BK_PARENS)
1983 goto handle_open;
1984 else
1985 goto normal_char;
1986
1987
1988 case ')':
1989 if (syntax & RE_NO_BK_PARENS)
1990 goto handle_close;
1991 else
1992 goto normal_char;
1993
1994
1995 case '\n':
1996 if (syntax & RE_NEWLINE_ALT)
1997 goto handle_alt;
1998 else
1999 goto normal_char;
2000
2001
2002 case '|':
2003 if (syntax & RE_NO_BK_VBAR)
2004 goto handle_alt;
2005 else
2006 goto normal_char;
2007
2008
2009 case '{':
2010 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2011 goto handle_interval;
2012 else
2013 goto normal_char;
2014
2015
2016 case '\\':
2017 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2018
2019 /* Do not translate the character after the \, so that we can
2020 distinguish, e.g., \B from \b, even if we normally would
2021 translate, e.g., B to b. */
2022 PATFETCH_RAW (c);
2023
2024 switch (c)
2025 {
2026 case '(':
2027 if (syntax & RE_NO_BK_PARENS)
2028 goto normal_backslash;
2029
2030 handle_open:
2031 bufp->re_nsub++;
2032 regnum++;
2033
2034 if (COMPILE_STACK_FULL)
2035 {
2036 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2037 compile_stack_elt_t);
2038 if (compile_stack.stack == NULL) return REG_ESPACE;
2039
2040 compile_stack.size <<= 1;
2041 }
2042
2043 /* These are the values to restore when we hit end of this
2044 group. They are all relative offsets, so that if the
2045 whole pattern moves because of realloc, they will still
2046 be valid. */
2047 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2048 COMPILE_STACK_TOP.fixup_alt_jump
2049 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2050 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2051 COMPILE_STACK_TOP.regnum = regnum;
2052
2053 /* We will eventually replace the 0 with the number of
2054 groups inner to this one. But do not push a
2055 start_memory for groups beyond the last one we can
2056 represent in the compiled pattern. */
2057 if (regnum <= MAX_REGNUM)
2058 {
2059 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
2060 BUF_PUSH_3 (start_memory, regnum, 0);
2061 }
2062
2063 compile_stack.avail++;
2064
2065 fixup_alt_jump = 0;
2066 laststart = 0;
2067 begalt = b;
2068 /* If we've reached MAX_REGNUM groups, then this open
2069 won't actually generate any code, so we'll have to
2070 clear pending_exact explicitly. */
2071 pending_exact = 0;
2072 break;
2073
2074
2075 case ')':
2076 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2077
2078 if (COMPILE_STACK_EMPTY)
2079 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2080 goto normal_backslash;
2081 else
2082 FREE_STACK_RETURN (REG_ERPAREN);
2083
2084 handle_close:
2085 if (fixup_alt_jump)
2086 { /* Push a dummy failure point at the end of the
2087 alternative for a possible future
2088 `pop_failure_jump' to pop. See comments at
2089 `push_dummy_failure' in `re_match_2'. */
2090 BUF_PUSH (push_dummy_failure);
2091
2092 /* We allocated space for this jump when we assigned
2093 to `fixup_alt_jump', in the `handle_alt' case below. */
2094 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
2095 }
2096
2097 /* See similar code for backslashed left paren above. */
2098 if (COMPILE_STACK_EMPTY)
2099 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2100 goto normal_char;
2101 else
2102 FREE_STACK_RETURN (REG_ERPAREN);
2103
2104 /* Since we just checked for an empty stack above, this
2105 ``can't happen''. */
2106 assert (compile_stack.avail != 0);
2107 {
2108 /* We don't just want to restore into `regnum', because
2109 later groups should continue to be numbered higher,
2110 as in `(ab)c(de)' -- the second group is #2. */
2111 regnum_t this_group_regnum;
2112
2113 compile_stack.avail--;
2114 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2115 fixup_alt_jump
2116 = COMPILE_STACK_TOP.fixup_alt_jump
2117 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2118 : 0;
2119 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2120 this_group_regnum = COMPILE_STACK_TOP.regnum;
2121 /* If we've reached MAX_REGNUM groups, then this open
2122 won't actually generate any code, so we'll have to
2123 clear pending_exact explicitly. */
2124 pending_exact = 0;
2125
2126 /* We're at the end of the group, so now we know how many
2127 groups were inside this one. */
2128 if (this_group_regnum <= MAX_REGNUM)
2129 {
2130 unsigned char *inner_group_loc
2131 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
2132
2133 *inner_group_loc = regnum - this_group_regnum;
2134 BUF_PUSH_3 (stop_memory, this_group_regnum,
2135 regnum - this_group_regnum);
2136 }
2137 }
2138 break;
2139
2140
2141 case '|': /* `\|'. */
2142 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2143 goto normal_backslash;
2144 handle_alt:
2145 if (syntax & RE_LIMITED_OPS)
2146 goto normal_char;
2147
2148 /* Insert before the previous alternative a jump which
2149 jumps to this alternative if the former fails. */
2150 GET_BUFFER_SPACE (3);
2151 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2152 pending_exact = 0;
2153 b += 3;
2154
2155 /* The alternative before this one has a jump after it
2156 which gets executed if it gets matched. Adjust that
2157 jump so it will jump to this alternative's analogous
2158 jump (put in below, which in turn will jump to the next
2159 (if any) alternative's such jump, etc.). The last such
2160 jump jumps to the correct final destination. A picture:
2161 _____ _____
2162 | | | |
2163 | v | v
2164 a | b | c
2165
2166 If we are at `b', then fixup_alt_jump right now points to a
2167 three-byte space after `a'. We'll put in the jump, set
2168 fixup_alt_jump to right after `b', and leave behind three
2169 bytes which we'll fill in when we get to after `c'. */
2170
2171 if (fixup_alt_jump)
2172 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2173
2174 /* Mark and leave space for a jump after this alternative,
2175 to be filled in later either by next alternative or
2176 when know we're at the end of a series of alternatives. */
2177 fixup_alt_jump = b;
2178 GET_BUFFER_SPACE (3);
2179 b += 3;
2180
2181 laststart = 0;
2182 begalt = b;
2183 break;
2184
2185
2186 case '{':
2187 /* If \{ is a literal. */
2188 if (!(syntax & RE_INTERVALS)
2189 /* If we're at `\{' and it's not the open-interval
2190 operator. */
2191 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2192 || (p - 2 == pattern && p == pend))
2193 goto normal_backslash;
2194
2195 handle_interval:
2196 {
2197 /* If got here, then the syntax allows intervals. */
2198
2199 /* At least (most) this many matches must be made. */
2200 int lower_bound = -1, upper_bound = -1;
2201
2202 beg_interval = p - 1;
2203
2204 if (p == pend)
2205 {
2206 if (syntax & RE_NO_BK_BRACES)
2207 goto unfetch_interval;
2208 else
2209 FREE_STACK_RETURN (REG_EBRACE);
2210 }
2211
2212 GET_UNSIGNED_NUMBER (lower_bound);
2213
2214 if (c == ',')
2215 {
2216 GET_UNSIGNED_NUMBER (upper_bound);
2217 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
2218 }
2219 else
2220 /* Interval such as `{1}' => match exactly once. */
2221 upper_bound = lower_bound;
2222
2223 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2224 || lower_bound > upper_bound)
2225 {
2226 if (syntax & RE_NO_BK_BRACES)
2227 goto unfetch_interval;
2228 else
2229 FREE_STACK_RETURN (REG_BADBR);
2230 }
2231
2232 if (!(syntax & RE_NO_BK_BRACES))
2233 {
2234 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2235
2236 PATFETCH (c);
2237 }
2238
2239 if (c != '}')
2240 {
2241 if (syntax & RE_NO_BK_BRACES)
2242 goto unfetch_interval;
2243 else
2244 FREE_STACK_RETURN (REG_BADBR);
2245 }
2246
2247 /* We just parsed a valid interval. */
2248
2249 /* If it's invalid to have no preceding re. */
2250 if (!laststart)
2251 {
2252 if (syntax & RE_CONTEXT_INVALID_OPS)
2253 FREE_STACK_RETURN (REG_BADRPT);
2254 else if (syntax & RE_CONTEXT_INDEP_OPS)
2255 laststart = b;
2256 else
2257 goto unfetch_interval;
2258 }
2259
2260 /* If the upper bound is zero, don't want to succeed at
2261 all; jump from `laststart' to `b + 3', which will be
2262 the end of the buffer after we insert the jump. */
2263 if (upper_bound == 0)
2264 {
2265 GET_BUFFER_SPACE (3);
2266 INSERT_JUMP (jump, laststart, b + 3);
2267 b += 3;
2268 }
2269
2270 /* Otherwise, we have a nontrivial interval. When
2271 we're all done, the pattern will look like:
2272 set_number_at <jump count> <upper bound>
2273 set_number_at <succeed_n count> <lower bound>
2274 succeed_n <after jump addr> <succeed_n count>
2275 <body of loop>
2276 jump_n <succeed_n addr> <jump count>
2277 (The upper bound and `jump_n' are omitted if
2278 `upper_bound' is 1, though.) */
2279 else
2280 { /* If the upper bound is > 1, we need to insert
2281 more at the end of the loop. */
2282 unsigned nbytes = 10 + (upper_bound > 1) * 10;
2283
2284 GET_BUFFER_SPACE (nbytes);
2285
2286 /* Initialize lower bound of the `succeed_n', even
2287 though it will be set during matching by its
2288 attendant `set_number_at' (inserted next),
2289 because `re_compile_fastmap' needs to know.
2290 Jump to the `jump_n' we might insert below. */
2291 INSERT_JUMP2 (succeed_n, laststart,
2292 b + 5 + (upper_bound > 1) * 5,
2293 lower_bound);
2294 b += 5;
2295
2296 /* Code to initialize the lower bound. Insert
2297 before the `succeed_n'. The `5' is the last two
2298 bytes of this `set_number_at', plus 3 bytes of
2299 the following `succeed_n'. */
2300 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2301 b += 5;
2302
2303 if (upper_bound > 1)
2304 { /* More than one repetition is allowed, so
2305 append a backward jump to the `succeed_n'
2306 that starts this interval.
2307
2308 When we've reached this during matching,
2309 we'll have matched the interval once, so
2310 jump back only `upper_bound - 1' times. */
2311 STORE_JUMP2 (jump_n, b, laststart + 5,
2312 upper_bound - 1);
2313 b += 5;
2314
2315 /* The location we want to set is the second
2316 parameter of the `jump_n'; that is `b-2' as
2317 an absolute address. `laststart' will be
2318 the `set_number_at' we're about to insert;
2319 `laststart+3' the number to set, the source
2320 for the relative address. But we are
2321 inserting into the middle of the pattern --
2322 so everything is getting moved up by 5.
2323 Conclusion: (b - 2) - (laststart + 3) + 5,
2324 i.e., b - laststart.
2325
2326 We insert this at the beginning of the loop
2327 so that if we fail during matching, we'll
2328 reinitialize the bounds. */
2329 insert_op2 (set_number_at, laststart, b - laststart,
2330 upper_bound - 1, b);
2331 b += 5;
2332 }
2333 }
2334 pending_exact = 0;
2335 beg_interval = NULL;
2336 }
2337 break;
2338
2339 unfetch_interval:
2340 /* If an invalid interval, match the characters as literals. */
2341 assert (beg_interval);
2342 p = beg_interval;
2343 beg_interval = NULL;
2344
2345 /* normal_char and normal_backslash need `c'. */
2346 PATFETCH (c);
2347
2348 if (!(syntax & RE_NO_BK_BRACES))
2349 {
2350 if (p > pattern && p[-1] == '\\')
2351 goto normal_backslash;
2352 }
2353 goto normal_char;
2354
2355 #ifdef emacs
2356 /* There is no way to specify the before_dot and after_dot
2357 operators. rms says this is ok. --karl */
2358 case '=':
2359 BUF_PUSH (at_dot);
2360 break;
2361
2362 case 's':
2363 laststart = b;
2364 PATFETCH (c);
2365 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2366 break;
2367
2368 case 'S':
2369 laststart = b;
2370 PATFETCH (c);
2371 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2372 break;
2373 #endif /* emacs */
2374
2375
2376 case 'w':
2377 laststart = b;
2378 BUF_PUSH (wordchar);
2379 break;
2380
2381
2382 case 'W':
2383 laststart = b;
2384 BUF_PUSH (notwordchar);
2385 break;
2386
2387
2388 case '<':
2389 BUF_PUSH (wordbeg);
2390 break;
2391
2392 case '>':
2393 BUF_PUSH (wordend);
2394 break;
2395
2396 case 'b':
2397 BUF_PUSH (wordbound);
2398 break;
2399
2400 case 'B':
2401 BUF_PUSH (notwordbound);
2402 break;
2403
2404 case '`':
2405 BUF_PUSH (begbuf);
2406 break;
2407
2408 case '\'':
2409 BUF_PUSH (endbuf);
2410 break;
2411
2412 case '1': case '2': case '3': case '4': case '5':
2413 case '6': case '7': case '8': case '9':
2414 if (syntax & RE_NO_BK_REFS)
2415 goto normal_char;
2416
2417 c1 = c - '0';
2418
2419 if (c1 > regnum)
2420 FREE_STACK_RETURN (REG_ESUBREG);
2421
2422 /* Can't back reference to a subexpression if inside of it. */
2423 if (group_in_compile_stack (compile_stack, c1))
2424 goto normal_char;
2425
2426 laststart = b;
2427 BUF_PUSH_2 (duplicate, c1);
2428 break;
2429
2430
2431 case '+':
2432 case '?':
2433 if (syntax & RE_BK_PLUS_QM)
2434 goto handle_plus;
2435 else
2436 goto normal_backslash;
2437
2438 default:
2439 normal_backslash:
2440 /* You might think it would be useful for \ to mean
2441 not to translate; but if we don't translate it
2442 it will never match anything. */
2443 c = TRANSLATE (c);
2444 goto normal_char;
2445 }
2446 break;
2447
2448
2449 default:
2450 /* Expects the character in `c'. */
2451 normal_char:
2452 /* If no exactn currently being built. */
2453 if (!pending_exact
2454
2455 /* If last exactn not at current position. */
2456 || pending_exact + *pending_exact + 1 != b
2457
2458 /* We have only one byte following the exactn for the count. */
2459 || *pending_exact == (1 << BYTEWIDTH) - 1
2460
2461 /* If followed by a repetition operator. */
2462 || *p == '*' || *p == '^'
2463 || ((syntax & RE_BK_PLUS_QM)
2464 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2465 : (*p == '+' || *p == '?'))
2466 || ((syntax & RE_INTERVALS)
2467 && ((syntax & RE_NO_BK_BRACES)
2468 ? *p == '{'
2469 : (p[0] == '\\' && p[1] == '{'))))
2470 {
2471 /* Start building a new exactn. */
2472
2473 laststart = b;
2474
2475 BUF_PUSH_2 (exactn, 0);
2476 pending_exact = b - 1;
2477 }
2478
2479 BUF_PUSH (c);
2480 (*pending_exact)++;
2481 break;
2482 } /* switch (c) */
2483 } /* while p != pend */
2484
2485
2486 /* Through the pattern now. */
2487
2488 if (fixup_alt_jump)
2489 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2490
2491 if (!COMPILE_STACK_EMPTY)
2492 FREE_STACK_RETURN (REG_EPAREN);
2493
2494 /* If we don't want backtracking, force success
2495 the first time we reach the end of the compiled pattern. */
2496 if (syntax & RE_NO_POSIX_BACKTRACKING)
2497 BUF_PUSH (succeed);
2498
2499 free (compile_stack.stack);
2500
2501 /* We have succeeded; set the length of the buffer. */
2502 bufp->used = b - bufp->buffer;
2503
2504 #ifdef DEBUG
2505 if (debug)
2506 {
2507 DEBUG_PRINT1 ("\nCompiled pattern: \n");
2508 print_compiled_pattern (bufp);
2509 }
2510 #endif /* DEBUG */
2511
2512 #ifndef MATCH_MAY_ALLOCATE
2513 /* Initialize the failure stack to the largest possible stack. This
2514 isn't necessary unless we're trying to avoid calling alloca in
2515 the search and match routines. */
2516 {
2517 int num_regs = bufp->re_nsub + 1;
2518
2519 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
2520 is strictly greater than re_max_failures, the largest possible stack
2521 is 2 * re_max_failures failure points. */
2522 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS))
2523 {
2524 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS);
2525
2526 #ifdef emacs
2527 if (! fail_stack.stack)
2528 fail_stack.stack
2529 = (fail_stack_elt_t *) xmalloc (fail_stack.size
2530 * sizeof (fail_stack_elt_t));
2531 else
2532 fail_stack.stack
2533 = (fail_stack_elt_t *) xrealloc (fail_stack.stack,
2534 (fail_stack.size
2535 * sizeof (fail_stack_elt_t)));
2536 #else /* not emacs */
2537 if (! fail_stack.stack)
2538 fail_stack.stack
2539 = (fail_stack_elt_t *) malloc (fail_stack.size
2540 * sizeof (fail_stack_elt_t));
2541 else
2542 fail_stack.stack
2543 = (fail_stack_elt_t *) realloc (fail_stack.stack,
2544 (fail_stack.size
2545 * sizeof (fail_stack_elt_t)));
2546 #endif /* not emacs */
2547 }
2548
2549 /* Initialize some other variables the matcher uses. */
2550 RETALLOC_IF (regstart, num_regs, const char *);
2551 RETALLOC_IF (regend, num_regs, const char *);
2552 RETALLOC_IF (old_regstart, num_regs, const char *);
2553 RETALLOC_IF (old_regend, num_regs, const char *);
2554 RETALLOC_IF (best_regstart, num_regs, const char *);
2555 RETALLOC_IF (best_regend, num_regs, const char *);
2556 RETALLOC_IF (reg_info, num_regs, register_info_type);
2557 RETALLOC_IF (reg_dummy, num_regs, const char *);
2558 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);
2559 }
2560 #endif
2561
2562 return REG_NOERROR;
2563 } /* regex_compile */
2564 \f
2565 /* Subroutines for `regex_compile'. */
2566
2567 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2568
2569 static void
2570 store_op1 (op, loc, arg)
2571 re_opcode_t op;
2572 unsigned char *loc;
2573 int arg;
2574 {
2575 *loc = (unsigned char) op;
2576 STORE_NUMBER (loc + 1, arg);
2577 }
2578
2579
2580 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2581
2582 static void
2583 store_op2 (op, loc, arg1, arg2)
2584 re_opcode_t op;
2585 unsigned char *loc;
2586 int arg1, arg2;
2587 {
2588 *loc = (unsigned char) op;
2589 STORE_NUMBER (loc + 1, arg1);
2590 STORE_NUMBER (loc + 3, arg2);
2591 }
2592
2593
2594 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2595 for OP followed by two-byte integer parameter ARG. */
2596
2597 static void
2598 insert_op1 (op, loc, arg, end)
2599 re_opcode_t op;
2600 unsigned char *loc;
2601 int arg;
2602 unsigned char *end;
2603 {
2604 register unsigned char *pfrom = end;
2605 register unsigned char *pto = end + 3;
2606
2607 while (pfrom != loc)
2608 *--pto = *--pfrom;
2609
2610 store_op1 (op, loc, arg);
2611 }
2612
2613
2614 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2615
2616 static void
2617 insert_op2 (op, loc, arg1, arg2, end)
2618 re_opcode_t op;
2619 unsigned char *loc;
2620 int arg1, arg2;
2621 unsigned char *end;
2622 {
2623 register unsigned char *pfrom = end;
2624 register unsigned char *pto = end + 5;
2625
2626 while (pfrom != loc)
2627 *--pto = *--pfrom;
2628
2629 store_op2 (op, loc, arg1, arg2);
2630 }
2631
2632
2633 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2634 after an alternative or a begin-subexpression. We assume there is at
2635 least one character before the ^. */
2636
2637 static boolean
2638 at_begline_loc_p (pattern, p, syntax)
2639 const char *pattern, *p;
2640 reg_syntax_t syntax;
2641 {
2642 const char *prev = p - 2;
2643 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2644
2645 return
2646 /* After a subexpression? */
2647 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2648 /* After an alternative? */
2649 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2650 }
2651
2652
2653 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2654 at least one character after the $, i.e., `P < PEND'. */
2655
2656 static boolean
2657 at_endline_loc_p (p, pend, syntax)
2658 const char *p, *pend;
2659 int syntax;
2660 {
2661 const char *next = p;
2662 boolean next_backslash = *next == '\\';
2663 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2664
2665 return
2666 /* Before a subexpression? */
2667 (syntax & RE_NO_BK_PARENS ? *next == ')'
2668 : next_backslash && next_next && *next_next == ')')
2669 /* Before an alternative? */
2670 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2671 : next_backslash && next_next && *next_next == '|');
2672 }
2673
2674
2675 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2676 false if it's not. */
2677
2678 static boolean
2679 group_in_compile_stack (compile_stack, regnum)
2680 compile_stack_type compile_stack;
2681 regnum_t regnum;
2682 {
2683 int this_element;
2684
2685 for (this_element = compile_stack.avail - 1;
2686 this_element >= 0;
2687 this_element--)
2688 if (compile_stack.stack[this_element].regnum == regnum)
2689 return true;
2690
2691 return false;
2692 }
2693
2694
2695 /* Read the ending character of a range (in a bracket expression) from the
2696 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2697 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2698 Then we set the translation of all bits between the starting and
2699 ending characters (inclusive) in the compiled pattern B.
2700
2701 Return an error code.
2702
2703 We use these short variable names so we can use the same macros as
2704 `regex_compile' itself. */
2705
2706 static reg_errcode_t
2707 compile_range (p_ptr, pend, translate, syntax, b)
2708 const char **p_ptr, *pend;
2709 char *translate;
2710 reg_syntax_t syntax;
2711 unsigned char *b;
2712 {
2713 unsigned this_char;
2714
2715 const char *p = *p_ptr;
2716 int range_start, range_end;
2717
2718 if (p == pend)
2719 return REG_ERANGE;
2720
2721 /* Even though the pattern is a signed `char *', we need to fetch
2722 with unsigned char *'s; if the high bit of the pattern character
2723 is set, the range endpoints will be negative if we fetch using a
2724 signed char *.
2725
2726 We also want to fetch the endpoints without translating them; the
2727 appropriate translation is done in the bit-setting loop below. */
2728 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
2729 range_start = ((const unsigned char *) p)[-2];
2730 range_end = ((const unsigned char *) p)[0];
2731
2732 /* Have to increment the pointer into the pattern string, so the
2733 caller isn't still at the ending character. */
2734 (*p_ptr)++;
2735
2736 /* If the start is after the end, the range is empty. */
2737 if (range_start > range_end)
2738 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2739
2740 /* Here we see why `this_char' has to be larger than an `unsigned
2741 char' -- the range is inclusive, so if `range_end' == 0xff
2742 (assuming 8-bit characters), we would otherwise go into an infinite
2743 loop, since all characters <= 0xff. */
2744 for (this_char = range_start; this_char <= range_end; this_char++)
2745 {
2746 SET_LIST_BIT (TRANSLATE (this_char));
2747 }
2748
2749 return REG_NOERROR;
2750 }
2751 \f
2752 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2753 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2754 characters can start a string that matches the pattern. This fastmap
2755 is used by re_search to skip quickly over impossible starting points.
2756
2757 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2758 area as BUFP->fastmap.
2759
2760 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2761 the pattern buffer.
2762
2763 Returns 0 if we succeed, -2 if an internal error. */
2764
2765 int
2766 re_compile_fastmap (bufp)
2767 struct re_pattern_buffer *bufp;
2768 {
2769 int j, k;
2770 #ifdef MATCH_MAY_ALLOCATE
2771 fail_stack_type fail_stack;
2772 #endif
2773 #ifndef REGEX_MALLOC
2774 char *destination;
2775 #endif
2776 /* We don't push any register information onto the failure stack. */
2777 unsigned num_regs = 0;
2778
2779 register char *fastmap = bufp->fastmap;
2780 unsigned char *pattern = bufp->buffer;
2781 unsigned long size = bufp->used;
2782 unsigned char *p = pattern;
2783 register unsigned char *pend = pattern + size;
2784
2785 /* Assume that each path through the pattern can be null until
2786 proven otherwise. We set this false at the bottom of switch
2787 statement, to which we get only if a particular path doesn't
2788 match the empty string. */
2789 boolean path_can_be_null = true;
2790
2791 /* We aren't doing a `succeed_n' to begin with. */
2792 boolean succeed_n_p = false;
2793
2794 assert (fastmap != NULL && p != NULL);
2795
2796 INIT_FAIL_STACK ();
2797 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2798 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2799 bufp->can_be_null = 0;
2800
2801 while (1)
2802 {
2803 if (p == pend || *p == succeed)
2804 {
2805 /* We have reached the (effective) end of pattern. */
2806 if (!FAIL_STACK_EMPTY ())
2807 {
2808 bufp->can_be_null |= path_can_be_null;
2809
2810 /* Reset for next path. */
2811 path_can_be_null = true;
2812
2813 p = fail_stack.stack[--fail_stack.avail];
2814
2815 continue;
2816 }
2817 else
2818 break;
2819 }
2820
2821 /* We should never be about to go beyond the end of the pattern. */
2822 assert (p < pend);
2823
2824 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
2825 {
2826
2827 /* I guess the idea here is to simply not bother with a fastmap
2828 if a backreference is used, since it's too hard to figure out
2829 the fastmap for the corresponding group. Setting
2830 `can_be_null' stops `re_search_2' from using the fastmap, so
2831 that is all we do. */
2832 case duplicate:
2833 bufp->can_be_null = 1;
2834 return 0;
2835
2836
2837 /* Following are the cases which match a character. These end
2838 with `break'. */
2839
2840 case exactn:
2841 fastmap[p[1]] = 1;
2842 break;
2843
2844
2845 case charset:
2846 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2847 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2848 fastmap[j] = 1;
2849 break;
2850
2851
2852 case charset_not:
2853 /* Chars beyond end of map must be allowed. */
2854 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2855 fastmap[j] = 1;
2856
2857 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2858 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2859 fastmap[j] = 1;
2860 break;
2861
2862
2863 case wordchar:
2864 for (j = 0; j < (1 << BYTEWIDTH); j++)
2865 if (SYNTAX (j) == Sword)
2866 fastmap[j] = 1;
2867 break;
2868
2869
2870 case notwordchar:
2871 for (j = 0; j < (1 << BYTEWIDTH); j++)
2872 if (SYNTAX (j) != Sword)
2873 fastmap[j] = 1;
2874 break;
2875
2876
2877 case anychar:
2878 {
2879 int fastmap_newline = fastmap['\n'];
2880
2881 /* `.' matches anything ... */
2882 for (j = 0; j < (1 << BYTEWIDTH); j++)
2883 fastmap[j] = 1;
2884
2885 /* ... except perhaps newline. */
2886 if (!(bufp->syntax & RE_DOT_NEWLINE))
2887 fastmap['\n'] = fastmap_newline;
2888
2889 /* Return if we have already set `can_be_null'; if we have,
2890 then the fastmap is irrelevant. Something's wrong here. */
2891 else if (bufp->can_be_null)
2892 return 0;
2893
2894 /* Otherwise, have to check alternative paths. */
2895 break;
2896 }
2897
2898 #ifdef emacs
2899 case syntaxspec:
2900 k = *p++;
2901 for (j = 0; j < (1 << BYTEWIDTH); j++)
2902 if (SYNTAX (j) == (enum syntaxcode) k)
2903 fastmap[j] = 1;
2904 break;
2905
2906
2907 case notsyntaxspec:
2908 k = *p++;
2909 for (j = 0; j < (1 << BYTEWIDTH); j++)
2910 if (SYNTAX (j) != (enum syntaxcode) k)
2911 fastmap[j] = 1;
2912 break;
2913
2914
2915 /* All cases after this match the empty string. These end with
2916 `continue'. */
2917
2918
2919 case before_dot:
2920 case at_dot:
2921 case after_dot:
2922 continue;
2923 #endif /* not emacs */
2924
2925
2926 case no_op:
2927 case begline:
2928 case endline:
2929 case begbuf:
2930 case endbuf:
2931 case wordbound:
2932 case notwordbound:
2933 case wordbeg:
2934 case wordend:
2935 case push_dummy_failure:
2936 continue;
2937
2938
2939 case jump_n:
2940 case pop_failure_jump:
2941 case maybe_pop_jump:
2942 case jump:
2943 case jump_past_alt:
2944 case dummy_failure_jump:
2945 EXTRACT_NUMBER_AND_INCR (j, p);
2946 p += j;
2947 if (j > 0)
2948 continue;
2949
2950 /* Jump backward implies we just went through the body of a
2951 loop and matched nothing. Opcode jumped to should be
2952 `on_failure_jump' or `succeed_n'. Just treat it like an
2953 ordinary jump. For a * loop, it has pushed its failure
2954 point already; if so, discard that as redundant. */
2955 if ((re_opcode_t) *p != on_failure_jump
2956 && (re_opcode_t) *p != succeed_n)
2957 continue;
2958
2959 p++;
2960 EXTRACT_NUMBER_AND_INCR (j, p);
2961 p += j;
2962
2963 /* If what's on the stack is where we are now, pop it. */
2964 if (!FAIL_STACK_EMPTY ()
2965 && fail_stack.stack[fail_stack.avail - 1] == p)
2966 fail_stack.avail--;
2967
2968 continue;
2969
2970
2971 case on_failure_jump:
2972 case on_failure_keep_string_jump:
2973 handle_on_failure_jump:
2974 EXTRACT_NUMBER_AND_INCR (j, p);
2975
2976 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2977 end of the pattern. We don't want to push such a point,
2978 since when we restore it above, entering the switch will
2979 increment `p' past the end of the pattern. We don't need
2980 to push such a point since we obviously won't find any more
2981 fastmap entries beyond `pend'. Such a pattern can match
2982 the null string, though. */
2983 if (p + j < pend)
2984 {
2985 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2986 return -2;
2987 }
2988 else
2989 bufp->can_be_null = 1;
2990
2991 if (succeed_n_p)
2992 {
2993 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2994 succeed_n_p = false;
2995 }
2996
2997 continue;
2998
2999
3000 case succeed_n:
3001 /* Get to the number of times to succeed. */
3002 p += 2;
3003
3004 /* Increment p past the n for when k != 0. */
3005 EXTRACT_NUMBER_AND_INCR (k, p);
3006 if (k == 0)
3007 {
3008 p -= 4;
3009 succeed_n_p = true; /* Spaghetti code alert. */
3010 goto handle_on_failure_jump;
3011 }
3012 continue;
3013
3014
3015 case set_number_at:
3016 p += 4;
3017 continue;
3018
3019
3020 case start_memory:
3021 case stop_memory:
3022 p += 2;
3023 continue;
3024
3025
3026 default:
3027 abort (); /* We have listed all the cases. */
3028 } /* switch *p++ */
3029
3030 /* Getting here means we have found the possible starting
3031 characters for one path of the pattern -- and that the empty
3032 string does not match. We need not follow this path further.
3033 Instead, look at the next alternative (remembered on the
3034 stack), or quit if no more. The test at the top of the loop
3035 does these things. */
3036 path_can_be_null = false;
3037 p = pend;
3038 } /* while p */
3039
3040 /* Set `can_be_null' for the last path (also the first path, if the
3041 pattern is empty). */
3042 bufp->can_be_null |= path_can_be_null;
3043 return 0;
3044 } /* re_compile_fastmap */
3045 \f
3046 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3047 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3048 this memory for recording register information. STARTS and ENDS
3049 must be allocated using the malloc library routine, and must each
3050 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3051
3052 If NUM_REGS == 0, then subsequent matches should allocate their own
3053 register data.
3054
3055 Unless this function is called, the first search or match using
3056 PATTERN_BUFFER will allocate its own register data, without
3057 freeing the old data. */
3058
3059 void
3060 re_set_registers (bufp, regs, num_regs, starts, ends)
3061 struct re_pattern_buffer *bufp;
3062 struct re_registers *regs;
3063 unsigned num_regs;
3064 regoff_t *starts, *ends;
3065 {
3066 if (num_regs)
3067 {
3068 bufp->regs_allocated = REGS_REALLOCATE;
3069 regs->num_regs = num_regs;
3070 regs->start = starts;
3071 regs->end = ends;
3072 }
3073 else
3074 {
3075 bufp->regs_allocated = REGS_UNALLOCATED;
3076 regs->num_regs = 0;
3077 regs->start = regs->end = (regoff_t *) 0;
3078 }
3079 }
3080 \f
3081 /* Searching routines. */
3082
3083 /* Like re_search_2, below, but only one string is specified, and
3084 doesn't let you say where to stop matching. */
3085
3086 int
3087 re_search (bufp, string, size, startpos, range, regs)
3088 struct re_pattern_buffer *bufp;
3089 const char *string;
3090 int size, startpos, range;
3091 struct re_registers *regs;
3092 {
3093 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3094 regs, size);
3095 }
3096
3097
3098 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3099 virtual concatenation of STRING1 and STRING2, starting first at index
3100 STARTPOS, then at STARTPOS + 1, and so on.
3101
3102 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3103
3104 RANGE is how far to scan while trying to match. RANGE = 0 means try
3105 only at STARTPOS; in general, the last start tried is STARTPOS +
3106 RANGE.
3107
3108 In REGS, return the indices of the virtual concatenation of STRING1
3109 and STRING2 that matched the entire BUFP->buffer and its contained
3110 subexpressions.
3111
3112 Do not consider matching one past the index STOP in the virtual
3113 concatenation of STRING1 and STRING2.
3114
3115 We return either the position in the strings at which the match was
3116 found, -1 if no match, or -2 if error (such as failure
3117 stack overflow). */
3118
3119 int
3120 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
3121 struct re_pattern_buffer *bufp;
3122 const char *string1, *string2;
3123 int size1, size2;
3124 int startpos;
3125 int range;
3126 struct re_registers *regs;
3127 int stop;
3128 {
3129 int val;
3130 register char *fastmap = bufp->fastmap;
3131 register char *translate = bufp->translate;
3132 int total_size = size1 + size2;
3133 int endpos = startpos + range;
3134
3135 /* Check for out-of-range STARTPOS. */
3136 if (startpos < 0 || startpos > total_size)
3137 return -1;
3138
3139 /* Fix up RANGE if it might eventually take us outside
3140 the virtual concatenation of STRING1 and STRING2. */
3141 if (endpos < -1)
3142 range = -1 - startpos;
3143 else if (endpos > total_size)
3144 range = total_size - startpos;
3145
3146 /* If the search isn't to be a backwards one, don't waste time in a
3147 search for a pattern that must be anchored. */
3148 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3149 {
3150 if (startpos > 0)
3151 return -1;
3152 else
3153 range = 1;
3154 }
3155
3156 /* Update the fastmap now if not correct already. */
3157 if (fastmap && !bufp->fastmap_accurate)
3158 if (re_compile_fastmap (bufp) == -2)
3159 return -2;
3160
3161 /* Loop through the string, looking for a place to start matching. */
3162 for (;;)
3163 {
3164 /* If a fastmap is supplied, skip quickly over characters that
3165 cannot be the start of a match. If the pattern can match the
3166 null string, however, we don't need to skip characters; we want
3167 the first null string. */
3168 if (fastmap && startpos < total_size && !bufp->can_be_null)
3169 {
3170 if (range > 0) /* Searching forwards. */
3171 {
3172 register const char *d;
3173 register int lim = 0;
3174 int irange = range;
3175
3176 if (startpos < size1 && startpos + range >= size1)
3177 lim = range - (size1 - startpos);
3178
3179 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
3180
3181 /* Written out as an if-else to avoid testing `translate'
3182 inside the loop. */
3183 if (translate)
3184 while (range > lim
3185 && !fastmap[(unsigned char)
3186 translate[(unsigned char) *d++]])
3187 range--;
3188 else
3189 while (range > lim && !fastmap[(unsigned char) *d++])
3190 range--;
3191
3192 startpos += irange - range;
3193 }
3194 else /* Searching backwards. */
3195 {
3196 register char c = (size1 == 0 || startpos >= size1
3197 ? string2[startpos - size1]
3198 : string1[startpos]);
3199
3200 if (!fastmap[(unsigned char) TRANSLATE (c)])
3201 goto advance;
3202 }
3203 }
3204
3205 /* If can't match the null string, and that's all we have left, fail. */
3206 if (range >= 0 && startpos == total_size && fastmap
3207 && !bufp->can_be_null)
3208 return -1;
3209
3210 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3211 startpos, regs, stop);
3212 #ifndef REGEX_MALLOC
3213 #ifdef C_ALLOCA
3214 alloca (0);
3215 #endif
3216 #endif
3217
3218 if (val >= 0)
3219 return startpos;
3220
3221 if (val == -2)
3222 return -2;
3223
3224 advance:
3225 if (!range)
3226 break;
3227 else if (range > 0)
3228 {
3229 range--;
3230 startpos++;
3231 }
3232 else
3233 {
3234 range++;
3235 startpos--;
3236 }
3237 }
3238 return -1;
3239 } /* re_search_2 */
3240 \f
3241 /* Declarations and macros for re_match_2. */
3242
3243 static int bcmp_translate ();
3244 static boolean alt_match_null_string_p (),
3245 common_op_match_null_string_p (),
3246 group_match_null_string_p ();
3247
3248 /* This converts PTR, a pointer into one of the search strings `string1'
3249 and `string2' into an offset from the beginning of that string. */
3250 #define POINTER_TO_OFFSET(ptr) \
3251 (FIRST_STRING_P (ptr) \
3252 ? ((regoff_t) ((ptr) - string1)) \
3253 : ((regoff_t) ((ptr) - string2 + size1)))
3254
3255 /* Macros for dealing with the split strings in re_match_2. */
3256
3257 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3258
3259 /* Call before fetching a character with *d. This switches over to
3260 string2 if necessary. */
3261 #define PREFETCH() \
3262 while (d == dend) \
3263 { \
3264 /* End of string2 => fail. */ \
3265 if (dend == end_match_2) \
3266 goto fail; \
3267 /* End of string1 => advance to string2. */ \
3268 d = string2; \
3269 dend = end_match_2; \
3270 }
3271
3272
3273 /* Test if at very beginning or at very end of the virtual concatenation
3274 of `string1' and `string2'. If only one string, it's `string2'. */
3275 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3276 #define AT_STRINGS_END(d) ((d) == end2)
3277
3278
3279 /* Test if D points to a character which is word-constituent. We have
3280 two special cases to check for: if past the end of string1, look at
3281 the first character in string2; and if before the beginning of
3282 string2, look at the last character in string1. */
3283 #define WORDCHAR_P(d) \
3284 (SYNTAX ((d) == end1 ? *string2 \
3285 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3286 == Sword)
3287
3288 /* Test if the character before D and the one at D differ with respect
3289 to being word-constituent. */
3290 #define AT_WORD_BOUNDARY(d) \
3291 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3292 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3293
3294
3295 /* Free everything we malloc. */
3296 #ifdef MATCH_MAY_ALLOCATE
3297 #ifdef REGEX_MALLOC
3298 #define FREE_VAR(var) if (var) free (var); var = NULL
3299 #define FREE_VARIABLES() \
3300 do { \
3301 FREE_VAR (fail_stack.stack); \
3302 FREE_VAR (regstart); \
3303 FREE_VAR (regend); \
3304 FREE_VAR (old_regstart); \
3305 FREE_VAR (old_regend); \
3306 FREE_VAR (best_regstart); \
3307 FREE_VAR (best_regend); \
3308 FREE_VAR (reg_info); \
3309 FREE_VAR (reg_dummy); \
3310 FREE_VAR (reg_info_dummy); \
3311 } while (0)
3312 #else /* not REGEX_MALLOC */
3313 /* This used to do alloca (0), but now we do that in the caller. */
3314 #define FREE_VARIABLES() /* Nothing */
3315 #endif /* not REGEX_MALLOC */
3316 #else
3317 #define FREE_VARIABLES() /* Do nothing! */
3318 #endif /* not MATCH_MAY_ALLOCATE */
3319
3320 /* These values must meet several constraints. They must not be valid
3321 register values; since we have a limit of 255 registers (because
3322 we use only one byte in the pattern for the register number), we can
3323 use numbers larger than 255. They must differ by 1, because of
3324 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3325 be larger than the value for the highest register, so we do not try
3326 to actually save any registers when none are active. */
3327 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3328 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3329 \f
3330 /* Matching routines. */
3331
3332 #ifndef emacs /* Emacs never uses this. */
3333 /* re_match is like re_match_2 except it takes only a single string. */
3334
3335 int
3336 re_match (bufp, string, size, pos, regs)
3337 struct re_pattern_buffer *bufp;
3338 const char *string;
3339 int size, pos;
3340 struct re_registers *regs;
3341 {
3342 int result = re_match_2_internal (bufp, NULL, 0, string, size,
3343 pos, regs, size);
3344 alloca (0);
3345 return result;
3346 }
3347 #endif /* not emacs */
3348
3349
3350 /* re_match_2 matches the compiled pattern in BUFP against the
3351 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3352 and SIZE2, respectively). We start matching at POS, and stop
3353 matching at STOP.
3354
3355 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3356 store offsets for the substring each group matched in REGS. See the
3357 documentation for exactly how many groups we fill.
3358
3359 We return -1 if no match, -2 if an internal error (such as the
3360 failure stack overflowing). Otherwise, we return the length of the
3361 matched substring. */
3362
3363 int
3364 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3365 struct re_pattern_buffer *bufp;
3366 const char *string1, *string2;
3367 int size1, size2;
3368 int pos;
3369 struct re_registers *regs;
3370 int stop;
3371 {
3372 int result = re_match_2_internal (bufp, string1, size1, string2, size2,
3373 pos, regs, stop);
3374 alloca (0);
3375 return result;
3376 }
3377
3378 /* This is a separate function so that we can force an alloca cleanup
3379 afterwards. */
3380 static int
3381 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
3382 struct re_pattern_buffer *bufp;
3383 const char *string1, *string2;
3384 int size1, size2;
3385 int pos;
3386 struct re_registers *regs;
3387 int stop;
3388 {
3389 /* General temporaries. */
3390 int mcnt;
3391 unsigned char *p1;
3392
3393 /* Just past the end of the corresponding string. */
3394 const char *end1, *end2;
3395
3396 /* Pointers into string1 and string2, just past the last characters in
3397 each to consider matching. */
3398 const char *end_match_1, *end_match_2;
3399
3400 /* Where we are in the data, and the end of the current string. */
3401 const char *d, *dend;
3402
3403 /* Where we are in the pattern, and the end of the pattern. */
3404 unsigned char *p = bufp->buffer;
3405 register unsigned char *pend = p + bufp->used;
3406
3407 /* Mark the opcode just after a start_memory, so we can test for an
3408 empty subpattern when we get to the stop_memory. */
3409 unsigned char *just_past_start_mem = 0;
3410
3411 /* We use this to map every character in the string. */
3412 char *translate = bufp->translate;
3413
3414 /* Failure point stack. Each place that can handle a failure further
3415 down the line pushes a failure point on this stack. It consists of
3416 restart, regend, and reg_info for all registers corresponding to
3417 the subexpressions we're currently inside, plus the number of such
3418 registers, and, finally, two char *'s. The first char * is where
3419 to resume scanning the pattern; the second one is where to resume
3420 scanning the strings. If the latter is zero, the failure point is
3421 a ``dummy''; if a failure happens and the failure point is a dummy,
3422 it gets discarded and the next next one is tried. */
3423 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3424 fail_stack_type fail_stack;
3425 #endif
3426 #ifdef DEBUG
3427 static unsigned failure_id = 0;
3428 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3429 #endif
3430
3431 /* We fill all the registers internally, independent of what we
3432 return, for use in backreferences. The number here includes
3433 an element for register zero. */
3434 unsigned num_regs = bufp->re_nsub + 1;
3435
3436 /* The currently active registers. */
3437 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3438 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3439
3440 /* Information on the contents of registers. These are pointers into
3441 the input strings; they record just what was matched (on this
3442 attempt) by a subexpression part of the pattern, that is, the
3443 regnum-th regstart pointer points to where in the pattern we began
3444 matching and the regnum-th regend points to right after where we
3445 stopped matching the regnum-th subexpression. (The zeroth register
3446 keeps track of what the whole pattern matches.) */
3447 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3448 const char **regstart, **regend;
3449 #endif
3450
3451 /* If a group that's operated upon by a repetition operator fails to
3452 match anything, then the register for its start will need to be
3453 restored because it will have been set to wherever in the string we
3454 are when we last see its open-group operator. Similarly for a
3455 register's end. */
3456 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3457 const char **old_regstart, **old_regend;
3458 #endif
3459
3460 /* The is_active field of reg_info helps us keep track of which (possibly
3461 nested) subexpressions we are currently in. The matched_something
3462 field of reg_info[reg_num] helps us tell whether or not we have
3463 matched any of the pattern so far this time through the reg_num-th
3464 subexpression. These two fields get reset each time through any
3465 loop their register is in. */
3466 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
3467 register_info_type *reg_info;
3468 #endif
3469
3470 /* The following record the register info as found in the above
3471 variables when we find a match better than any we've seen before.
3472 This happens as we backtrack through the failure points, which in
3473 turn happens only if we have not yet matched the entire string. */
3474 unsigned best_regs_set = false;
3475 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3476 const char **best_regstart, **best_regend;
3477 #endif
3478
3479 /* Logically, this is `best_regend[0]'. But we don't want to have to
3480 allocate space for that if we're not allocating space for anything
3481 else (see below). Also, we never need info about register 0 for
3482 any of the other register vectors, and it seems rather a kludge to
3483 treat `best_regend' differently than the rest. So we keep track of
3484 the end of the best match so far in a separate variable. We
3485 initialize this to NULL so that when we backtrack the first time
3486 and need to test it, it's not garbage. */
3487 const char *match_end = NULL;
3488
3489 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
3490 int set_regs_matched_done = 0;
3491
3492 /* Used when we pop values we don't care about. */
3493 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
3494 const char **reg_dummy;
3495 register_info_type *reg_info_dummy;
3496 #endif
3497
3498 #ifdef DEBUG
3499 /* Counts the total number of registers pushed. */
3500 unsigned num_regs_pushed = 0;
3501 #endif
3502
3503 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3504
3505 INIT_FAIL_STACK ();
3506
3507 #ifdef MATCH_MAY_ALLOCATE
3508 /* Do not bother to initialize all the register variables if there are
3509 no groups in the pattern, as it takes a fair amount of time. If
3510 there are groups, we include space for register 0 (the whole
3511 pattern), even though we never use it, since it simplifies the
3512 array indexing. We should fix this. */
3513 if (bufp->re_nsub)
3514 {
3515 regstart = REGEX_TALLOC (num_regs, const char *);
3516 regend = REGEX_TALLOC (num_regs, const char *);
3517 old_regstart = REGEX_TALLOC (num_regs, const char *);
3518 old_regend = REGEX_TALLOC (num_regs, const char *);
3519 best_regstart = REGEX_TALLOC (num_regs, const char *);
3520 best_regend = REGEX_TALLOC (num_regs, const char *);
3521 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3522 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3523 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3524
3525 if (!(regstart && regend && old_regstart && old_regend && reg_info
3526 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3527 {
3528 FREE_VARIABLES ();
3529 return -2;
3530 }
3531 }
3532 #if defined (REGEX_MALLOC)
3533 else
3534 {
3535 /* We must initialize all our variables to NULL, so that
3536 `FREE_VARIABLES' doesn't try to free them. */
3537 regstart = regend = old_regstart = old_regend = best_regstart
3538 = best_regend = reg_dummy = NULL;
3539 reg_info = reg_info_dummy = (register_info_type *) NULL;
3540 }
3541 #endif /* REGEX_MALLOC */
3542 #endif /* MATCH_MAY_ALLOCATE */
3543
3544 /* The starting position is bogus. */
3545 if (pos < 0 || pos > size1 + size2)
3546 {
3547 FREE_VARIABLES ();
3548 return -1;
3549 }
3550
3551 /* Initialize subexpression text positions to -1 to mark ones that no
3552 start_memory/stop_memory has been seen for. Also initialize the
3553 register information struct. */
3554 for (mcnt = 1; mcnt < num_regs; mcnt++)
3555 {
3556 regstart[mcnt] = regend[mcnt]
3557 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3558
3559 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3560 IS_ACTIVE (reg_info[mcnt]) = 0;
3561 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3562 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3563 }
3564
3565 /* We move `string1' into `string2' if the latter's empty -- but not if
3566 `string1' is null. */
3567 if (size2 == 0 && string1 != NULL)
3568 {
3569 string2 = string1;
3570 size2 = size1;
3571 string1 = 0;
3572 size1 = 0;
3573 }
3574 end1 = string1 + size1;
3575 end2 = string2 + size2;
3576
3577 /* Compute where to stop matching, within the two strings. */
3578 if (stop <= size1)
3579 {
3580 end_match_1 = string1 + stop;
3581 end_match_2 = string2;
3582 }
3583 else
3584 {
3585 end_match_1 = end1;
3586 end_match_2 = string2 + stop - size1;
3587 }
3588
3589 /* `p' scans through the pattern as `d' scans through the data.
3590 `dend' is the end of the input string that `d' points within. `d'
3591 is advanced into the following input string whenever necessary, but
3592 this happens before fetching; therefore, at the beginning of the
3593 loop, `d' can be pointing at the end of a string, but it cannot
3594 equal `string2'. */
3595 if (size1 > 0 && pos <= size1)
3596 {
3597 d = string1 + pos;
3598 dend = end_match_1;
3599 }
3600 else
3601 {
3602 d = string2 + pos - size1;
3603 dend = end_match_2;
3604 }
3605
3606 DEBUG_PRINT1 ("The compiled pattern is: ");
3607 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3608 DEBUG_PRINT1 ("The string to match is: `");
3609 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3610 DEBUG_PRINT1 ("'\n");
3611
3612 /* This loops over pattern commands. It exits by returning from the
3613 function if the match is complete, or it drops through if the match
3614 fails at this starting point in the input data. */
3615 for (;;)
3616 {
3617 DEBUG_PRINT2 ("\n0x%x: ", p);
3618
3619 if (p == pend)
3620 { /* End of pattern means we might have succeeded. */
3621 DEBUG_PRINT1 ("end of pattern ... ");
3622
3623 /* If we haven't matched the entire string, and we want the
3624 longest match, try backtracking. */
3625 if (d != end_match_2)
3626 {
3627 /* 1 if this match ends in the same string (string1 or string2)
3628 as the best previous match. */
3629 boolean same_str_p = (FIRST_STRING_P (match_end)
3630 == MATCHING_IN_FIRST_STRING);
3631 /* 1 if this match is the best seen so far. */
3632 boolean best_match_p;
3633
3634 /* AIX compiler got confused when this was combined
3635 with the previous declaration. */
3636 if (same_str_p)
3637 best_match_p = d > match_end;
3638 else
3639 best_match_p = !MATCHING_IN_FIRST_STRING;
3640
3641 DEBUG_PRINT1 ("backtracking.\n");
3642
3643 if (!FAIL_STACK_EMPTY ())
3644 { /* More failure points to try. */
3645
3646 /* If exceeds best match so far, save it. */
3647 if (!best_regs_set || best_match_p)
3648 {
3649 best_regs_set = true;
3650 match_end = d;
3651
3652 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3653
3654 for (mcnt = 1; mcnt < num_regs; mcnt++)
3655 {
3656 best_regstart[mcnt] = regstart[mcnt];
3657 best_regend[mcnt] = regend[mcnt];
3658 }
3659 }
3660 goto fail;
3661 }
3662
3663 /* If no failure points, don't restore garbage. And if
3664 last match is real best match, don't restore second
3665 best one. */
3666 else if (best_regs_set && !best_match_p)
3667 {
3668 restore_best_regs:
3669 /* Restore best match. It may happen that `dend ==
3670 end_match_1' while the restored d is in string2.
3671 For example, the pattern `x.*y.*z' against the
3672 strings `x-' and `y-z-', if the two strings are
3673 not consecutive in memory. */
3674 DEBUG_PRINT1 ("Restoring best registers.\n");
3675
3676 d = match_end;
3677 dend = ((d >= string1 && d <= end1)
3678 ? end_match_1 : end_match_2);
3679
3680 for (mcnt = 1; mcnt < num_regs; mcnt++)
3681 {
3682 regstart[mcnt] = best_regstart[mcnt];
3683 regend[mcnt] = best_regend[mcnt];
3684 }
3685 }
3686 } /* d != end_match_2 */
3687
3688 succeed:
3689 DEBUG_PRINT1 ("Accepting match.\n");
3690
3691 /* If caller wants register contents data back, do it. */
3692 if (regs && !bufp->no_sub)
3693 {
3694 /* Have the register data arrays been allocated? */
3695 if (bufp->regs_allocated == REGS_UNALLOCATED)
3696 { /* No. So allocate them with malloc. We need one
3697 extra element beyond `num_regs' for the `-1' marker
3698 GNU code uses. */
3699 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3700 regs->start = TALLOC (regs->num_regs, regoff_t);
3701 regs->end = TALLOC (regs->num_regs, regoff_t);
3702 if (regs->start == NULL || regs->end == NULL)
3703 return -2;
3704 bufp->regs_allocated = REGS_REALLOCATE;
3705 }
3706 else if (bufp->regs_allocated == REGS_REALLOCATE)
3707 { /* Yes. If we need more elements than were already
3708 allocated, reallocate them. If we need fewer, just
3709 leave it alone. */
3710 if (regs->num_regs < num_regs + 1)
3711 {
3712 regs->num_regs = num_regs + 1;
3713 RETALLOC (regs->start, regs->num_regs, regoff_t);
3714 RETALLOC (regs->end, regs->num_regs, regoff_t);
3715 if (regs->start == NULL || regs->end == NULL)
3716 return -2;
3717 }
3718 }
3719 else
3720 {
3721 /* These braces fend off a "empty body in an else-statement"
3722 warning under GCC when assert expands to nothing. */
3723 assert (bufp->regs_allocated == REGS_FIXED);
3724 }
3725
3726 /* Convert the pointer data in `regstart' and `regend' to
3727 indices. Register zero has to be set differently,
3728 since we haven't kept track of any info for it. */
3729 if (regs->num_regs > 0)
3730 {
3731 regs->start[0] = pos;
3732 regs->end[0] = (MATCHING_IN_FIRST_STRING
3733 ? ((regoff_t) (d - string1))
3734 : ((regoff_t) (d - string2 + size1)));
3735 }
3736
3737 /* Go through the first `min (num_regs, regs->num_regs)'
3738 registers, since that is all we initialized. */
3739 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3740 {
3741 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3742 regs->start[mcnt] = regs->end[mcnt] = -1;
3743 else
3744 {
3745 regs->start[mcnt]
3746 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
3747 regs->end[mcnt]
3748 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
3749 }
3750 }
3751
3752 /* If the regs structure we return has more elements than
3753 were in the pattern, set the extra elements to -1. If
3754 we (re)allocated the registers, this is the case,
3755 because we always allocate enough to have at least one
3756 -1 at the end. */
3757 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3758 regs->start[mcnt] = regs->end[mcnt] = -1;
3759 } /* regs && !bufp->no_sub */
3760
3761 FREE_VARIABLES ();
3762 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3763 nfailure_points_pushed, nfailure_points_popped,
3764 nfailure_points_pushed - nfailure_points_popped);
3765 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3766
3767 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3768 ? string1
3769 : string2 - size1);
3770
3771 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3772
3773 return mcnt;
3774 }
3775
3776 /* Otherwise match next pattern command. */
3777 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3778 {
3779 /* Ignore these. Used to ignore the n of succeed_n's which
3780 currently have n == 0. */
3781 case no_op:
3782 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3783 break;
3784
3785 case succeed:
3786 DEBUG_PRINT1 ("EXECUTING succeed.\n");
3787 goto succeed;
3788
3789 /* Match the next n pattern characters exactly. The following
3790 byte in the pattern defines n, and the n bytes after that
3791 are the characters to match. */
3792 case exactn:
3793 mcnt = *p++;
3794 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3795
3796 /* This is written out as an if-else so we don't waste time
3797 testing `translate' inside the loop. */
3798 if (translate)
3799 {
3800 do
3801 {
3802 PREFETCH ();
3803 if (translate[(unsigned char) *d++] != (char) *p++)
3804 goto fail;
3805 }
3806 while (--mcnt);
3807 }
3808 else
3809 {
3810 do
3811 {
3812 PREFETCH ();
3813 if (*d++ != (char) *p++) goto fail;
3814 }
3815 while (--mcnt);
3816 }
3817 SET_REGS_MATCHED ();
3818 break;
3819
3820
3821 /* Match any character except possibly a newline or a null. */
3822 case anychar:
3823 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3824
3825 PREFETCH ();
3826
3827 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3828 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3829 goto fail;
3830
3831 SET_REGS_MATCHED ();
3832 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3833 d++;
3834 break;
3835
3836
3837 case charset:
3838 case charset_not:
3839 {
3840 register unsigned char c;
3841 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3842
3843 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3844
3845 PREFETCH ();
3846 c = TRANSLATE (*d); /* The character to match. */
3847
3848 /* Cast to `unsigned' instead of `unsigned char' in case the
3849 bit list is a full 32 bytes long. */
3850 if (c < (unsigned) (*p * BYTEWIDTH)
3851 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3852 not = !not;
3853
3854 p += 1 + *p;
3855
3856 if (!not) goto fail;
3857
3858 SET_REGS_MATCHED ();
3859 d++;
3860 break;
3861 }
3862
3863
3864 /* The beginning of a group is represented by start_memory.
3865 The arguments are the register number in the next byte, and the
3866 number of groups inner to this one in the next. The text
3867 matched within the group is recorded (in the internal
3868 registers data structure) under the register number. */
3869 case start_memory:
3870 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3871
3872 /* Find out if this group can match the empty string. */
3873 p1 = p; /* To send to group_match_null_string_p. */
3874
3875 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3876 REG_MATCH_NULL_STRING_P (reg_info[*p])
3877 = group_match_null_string_p (&p1, pend, reg_info);
3878
3879 /* Save the position in the string where we were the last time
3880 we were at this open-group operator in case the group is
3881 operated upon by a repetition operator, e.g., with `(a*)*b'
3882 against `ab'; then we want to ignore where we are now in
3883 the string in case this attempt to match fails. */
3884 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3885 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3886 : regstart[*p];
3887 DEBUG_PRINT2 (" old_regstart: %d\n",
3888 POINTER_TO_OFFSET (old_regstart[*p]));
3889
3890 regstart[*p] = d;
3891 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3892
3893 IS_ACTIVE (reg_info[*p]) = 1;
3894 MATCHED_SOMETHING (reg_info[*p]) = 0;
3895
3896 /* Clear this whenever we change the register activity status. */
3897 set_regs_matched_done = 0;
3898
3899 /* This is the new highest active register. */
3900 highest_active_reg = *p;
3901
3902 /* If nothing was active before, this is the new lowest active
3903 register. */
3904 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3905 lowest_active_reg = *p;
3906
3907 /* Move past the register number and inner group count. */
3908 p += 2;
3909 just_past_start_mem = p;
3910
3911 break;
3912
3913
3914 /* The stop_memory opcode represents the end of a group. Its
3915 arguments are the same as start_memory's: the register
3916 number, and the number of inner groups. */
3917 case stop_memory:
3918 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3919
3920 /* We need to save the string position the last time we were at
3921 this close-group operator in case the group is operated
3922 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3923 against `aba'; then we want to ignore where we are now in
3924 the string in case this attempt to match fails. */
3925 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3926 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3927 : regend[*p];
3928 DEBUG_PRINT2 (" old_regend: %d\n",
3929 POINTER_TO_OFFSET (old_regend[*p]));
3930
3931 regend[*p] = d;
3932 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3933
3934 /* This register isn't active anymore. */
3935 IS_ACTIVE (reg_info[*p]) = 0;
3936
3937 /* Clear this whenever we change the register activity status. */
3938 set_regs_matched_done = 0;
3939
3940 /* If this was the only register active, nothing is active
3941 anymore. */
3942 if (lowest_active_reg == highest_active_reg)
3943 {
3944 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3945 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3946 }
3947 else
3948 { /* We must scan for the new highest active register, since
3949 it isn't necessarily one less than now: consider
3950 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3951 new highest active register is 1. */
3952 unsigned char r = *p - 1;
3953 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3954 r--;
3955
3956 /* If we end up at register zero, that means that we saved
3957 the registers as the result of an `on_failure_jump', not
3958 a `start_memory', and we jumped to past the innermost
3959 `stop_memory'. For example, in ((.)*) we save
3960 registers 1 and 2 as a result of the *, but when we pop
3961 back to the second ), we are at the stop_memory 1.
3962 Thus, nothing is active. */
3963 if (r == 0)
3964 {
3965 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3966 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3967 }
3968 else
3969 highest_active_reg = r;
3970 }
3971
3972 /* If just failed to match something this time around with a
3973 group that's operated on by a repetition operator, try to
3974 force exit from the ``loop'', and restore the register
3975 information for this group that we had before trying this
3976 last match. */
3977 if ((!MATCHED_SOMETHING (reg_info[*p])
3978 || just_past_start_mem == p - 1)
3979 && (p + 2) < pend)
3980 {
3981 boolean is_a_jump_n = false;
3982
3983 p1 = p + 2;
3984 mcnt = 0;
3985 switch ((re_opcode_t) *p1++)
3986 {
3987 case jump_n:
3988 is_a_jump_n = true;
3989 case pop_failure_jump:
3990 case maybe_pop_jump:
3991 case jump:
3992 case dummy_failure_jump:
3993 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3994 if (is_a_jump_n)
3995 p1 += 2;
3996 break;
3997
3998 default:
3999 /* do nothing */ ;
4000 }
4001 p1 += mcnt;
4002
4003 /* If the next operation is a jump backwards in the pattern
4004 to an on_failure_jump right before the start_memory
4005 corresponding to this stop_memory, exit from the loop
4006 by forcing a failure after pushing on the stack the
4007 on_failure_jump's jump in the pattern, and d. */
4008 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
4009 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
4010 {
4011 /* If this group ever matched anything, then restore
4012 what its registers were before trying this last
4013 failed match, e.g., with `(a*)*b' against `ab' for
4014 regstart[1], and, e.g., with `((a*)*(b*)*)*'
4015 against `aba' for regend[3].
4016
4017 Also restore the registers for inner groups for,
4018 e.g., `((a*)(b*))*' against `aba' (register 3 would
4019 otherwise get trashed). */
4020
4021 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
4022 {
4023 unsigned r;
4024
4025 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
4026
4027 /* Restore this and inner groups' (if any) registers. */
4028 for (r = *p; r < *p + *(p + 1); r++)
4029 {
4030 regstart[r] = old_regstart[r];
4031
4032 /* xx why this test? */
4033 if ((int) old_regend[r] >= (int) regstart[r])
4034 regend[r] = old_regend[r];
4035 }
4036 }
4037 p1++;
4038 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4039 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
4040
4041 goto fail;
4042 }
4043 }
4044
4045 /* Move past the register number and the inner group count. */
4046 p += 2;
4047 break;
4048
4049
4050 /* \<digit> has been turned into a `duplicate' command which is
4051 followed by the numeric value of <digit> as the register number. */
4052 case duplicate:
4053 {
4054 register const char *d2, *dend2;
4055 int regno = *p++; /* Get which register to match against. */
4056 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4057
4058 /* Can't back reference a group which we've never matched. */
4059 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4060 goto fail;
4061
4062 /* Where in input to try to start matching. */
4063 d2 = regstart[regno];
4064
4065 /* Where to stop matching; if both the place to start and
4066 the place to stop matching are in the same string, then
4067 set to the place to stop, otherwise, for now have to use
4068 the end of the first string. */
4069
4070 dend2 = ((FIRST_STRING_P (regstart[regno])
4071 == FIRST_STRING_P (regend[regno]))
4072 ? regend[regno] : end_match_1);
4073 for (;;)
4074 {
4075 /* If necessary, advance to next segment in register
4076 contents. */
4077 while (d2 == dend2)
4078 {
4079 if (dend2 == end_match_2) break;
4080 if (dend2 == regend[regno]) break;
4081
4082 /* End of string1 => advance to string2. */
4083 d2 = string2;
4084 dend2 = regend[regno];
4085 }
4086 /* At end of register contents => success */
4087 if (d2 == dend2) break;
4088
4089 /* If necessary, advance to next segment in data. */
4090 PREFETCH ();
4091
4092 /* How many characters left in this segment to match. */
4093 mcnt = dend - d;
4094
4095 /* Want how many consecutive characters we can match in
4096 one shot, so, if necessary, adjust the count. */
4097 if (mcnt > dend2 - d2)
4098 mcnt = dend2 - d2;
4099
4100 /* Compare that many; failure if mismatch, else move
4101 past them. */
4102 if (translate
4103 ? bcmp_translate (d, d2, mcnt, translate)
4104 : bcmp (d, d2, mcnt))
4105 goto fail;
4106 d += mcnt, d2 += mcnt;
4107
4108 /* Do this because we've match some characters. */
4109 SET_REGS_MATCHED ();
4110 }
4111 }
4112 break;
4113
4114
4115 /* begline matches the empty string at the beginning of the string
4116 (unless `not_bol' is set in `bufp'), and, if
4117 `newline_anchor' is set, after newlines. */
4118 case begline:
4119 DEBUG_PRINT1 ("EXECUTING begline.\n");
4120
4121 if (AT_STRINGS_BEG (d))
4122 {
4123 if (!bufp->not_bol) break;
4124 }
4125 else if (d[-1] == '\n' && bufp->newline_anchor)
4126 {
4127 break;
4128 }
4129 /* In all other cases, we fail. */
4130 goto fail;
4131
4132
4133 /* endline is the dual of begline. */
4134 case endline:
4135 DEBUG_PRINT1 ("EXECUTING endline.\n");
4136
4137 if (AT_STRINGS_END (d))
4138 {
4139 if (!bufp->not_eol) break;
4140 }
4141
4142 /* We have to ``prefetch'' the next character. */
4143 else if ((d == end1 ? *string2 : *d) == '\n'
4144 && bufp->newline_anchor)
4145 {
4146 break;
4147 }
4148 goto fail;
4149
4150
4151 /* Match at the very beginning of the data. */
4152 case begbuf:
4153 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
4154 if (AT_STRINGS_BEG (d))
4155 break;
4156 goto fail;
4157
4158
4159 /* Match at the very end of the data. */
4160 case endbuf:
4161 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
4162 if (AT_STRINGS_END (d))
4163 break;
4164 goto fail;
4165
4166
4167 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
4168 pushes NULL as the value for the string on the stack. Then
4169 `pop_failure_point' will keep the current value for the
4170 string, instead of restoring it. To see why, consider
4171 matching `foo\nbar' against `.*\n'. The .* matches the foo;
4172 then the . fails against the \n. But the next thing we want
4173 to do is match the \n against the \n; if we restored the
4174 string value, we would be back at the foo.
4175
4176 Because this is used only in specific cases, we don't need to
4177 check all the things that `on_failure_jump' does, to make
4178 sure the right things get saved on the stack. Hence we don't
4179 share its code. The only reason to push anything on the
4180 stack at all is that otherwise we would have to change
4181 `anychar's code to do something besides goto fail in this
4182 case; that seems worse than this. */
4183 case on_failure_keep_string_jump:
4184 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
4185
4186 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4187 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
4188
4189 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
4190 break;
4191
4192
4193 /* Uses of on_failure_jump:
4194
4195 Each alternative starts with an on_failure_jump that points
4196 to the beginning of the next alternative. Each alternative
4197 except the last ends with a jump that in effect jumps past
4198 the rest of the alternatives. (They really jump to the
4199 ending jump of the following alternative, because tensioning
4200 these jumps is a hassle.)
4201
4202 Repeats start with an on_failure_jump that points past both
4203 the repetition text and either the following jump or
4204 pop_failure_jump back to this on_failure_jump. */
4205 case on_failure_jump:
4206 on_failure:
4207 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4208
4209 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4210 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4211
4212 /* If this on_failure_jump comes right before a group (i.e.,
4213 the original * applied to a group), save the information
4214 for that group and all inner ones, so that if we fail back
4215 to this point, the group's information will be correct.
4216 For example, in \(a*\)*\1, we need the preceding group,
4217 and in \(\(a*\)b*\)\2, we need the inner group. */
4218
4219 /* We can't use `p' to check ahead because we push
4220 a failure point to `p + mcnt' after we do this. */
4221 p1 = p;
4222
4223 /* We need to skip no_op's before we look for the
4224 start_memory in case this on_failure_jump is happening as
4225 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4226 against aba. */
4227 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4228 p1++;
4229
4230 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4231 {
4232 /* We have a new highest active register now. This will
4233 get reset at the start_memory we are about to get to,
4234 but we will have saved all the registers relevant to
4235 this repetition op, as described above. */
4236 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4237 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4238 lowest_active_reg = *(p1 + 1);
4239 }
4240
4241 DEBUG_PRINT1 (":\n");
4242 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4243 break;
4244
4245
4246 /* A smart repeat ends with `maybe_pop_jump'.
4247 We change it to either `pop_failure_jump' or `jump'. */
4248 case maybe_pop_jump:
4249 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4250 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4251 {
4252 register unsigned char *p2 = p;
4253
4254 /* Compare the beginning of the repeat with what in the
4255 pattern follows its end. If we can establish that there
4256 is nothing that they would both match, i.e., that we
4257 would have to backtrack because of (as in, e.g., `a*a')
4258 then we can change to pop_failure_jump, because we'll
4259 never have to backtrack.
4260
4261 This is not true in the case of alternatives: in
4262 `(a|ab)*' we do need to backtrack to the `ab' alternative
4263 (e.g., if the string was `ab'). But instead of trying to
4264 detect that here, the alternative has put on a dummy
4265 failure point which is what we will end up popping. */
4266
4267 /* Skip over open/close-group commands.
4268 If what follows this loop is a ...+ construct,
4269 look at what begins its body, since we will have to
4270 match at least one of that. */
4271 while (1)
4272 {
4273 if (p2 + 2 < pend
4274 && ((re_opcode_t) *p2 == stop_memory
4275 || (re_opcode_t) *p2 == start_memory))
4276 p2 += 3;
4277 else if (p2 + 6 < pend
4278 && (re_opcode_t) *p2 == dummy_failure_jump)
4279 p2 += 6;
4280 else
4281 break;
4282 }
4283
4284 p1 = p + mcnt;
4285 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4286 to the `maybe_finalize_jump' of this case. Examine what
4287 follows. */
4288
4289 /* If we're at the end of the pattern, we can change. */
4290 if (p2 == pend)
4291 {
4292 /* Consider what happens when matching ":\(.*\)"
4293 against ":/". I don't really understand this code
4294 yet. */
4295 p[-3] = (unsigned char) pop_failure_jump;
4296 DEBUG_PRINT1
4297 (" End of pattern: change to `pop_failure_jump'.\n");
4298 }
4299
4300 else if ((re_opcode_t) *p2 == exactn
4301 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4302 {
4303 register unsigned char c
4304 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4305
4306 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4307 {
4308 p[-3] = (unsigned char) pop_failure_jump;
4309 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4310 c, p1[5]);
4311 }
4312
4313 else if ((re_opcode_t) p1[3] == charset
4314 || (re_opcode_t) p1[3] == charset_not)
4315 {
4316 int not = (re_opcode_t) p1[3] == charset_not;
4317
4318 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4319 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4320 not = !not;
4321
4322 /* `not' is equal to 1 if c would match, which means
4323 that we can't change to pop_failure_jump. */
4324 if (!not)
4325 {
4326 p[-3] = (unsigned char) pop_failure_jump;
4327 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4328 }
4329 }
4330 }
4331 else if ((re_opcode_t) *p2 == charset)
4332 {
4333 #ifdef DEBUG
4334 register unsigned char c
4335 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4336 #endif
4337
4338 if ((re_opcode_t) p1[3] == exactn
4339 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
4340 && (p2[1 + p1[4] / BYTEWIDTH]
4341 & (1 << (p1[4] % BYTEWIDTH)))))
4342 {
4343 p[-3] = (unsigned char) pop_failure_jump;
4344 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4345 c, p1[5]);
4346 }
4347
4348 else if ((re_opcode_t) p1[3] == charset_not)
4349 {
4350 int idx;
4351 /* We win if the charset_not inside the loop
4352 lists every character listed in the charset after. */
4353 for (idx = 0; idx < (int) p2[1]; idx++)
4354 if (! (p2[2 + idx] == 0
4355 || (idx < (int) p1[4]
4356 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
4357 break;
4358
4359 if (idx == p2[1])
4360 {
4361 p[-3] = (unsigned char) pop_failure_jump;
4362 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4363 }
4364 }
4365 else if ((re_opcode_t) p1[3] == charset)
4366 {
4367 int idx;
4368 /* We win if the charset inside the loop
4369 has no overlap with the one after the loop. */
4370 for (idx = 0;
4371 idx < (int) p2[1] && idx < (int) p1[4];
4372 idx++)
4373 if ((p2[2 + idx] & p1[5 + idx]) != 0)
4374 break;
4375
4376 if (idx == p2[1] || idx == p1[4])
4377 {
4378 p[-3] = (unsigned char) pop_failure_jump;
4379 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4380 }
4381 }
4382 }
4383 }
4384 p -= 2; /* Point at relative address again. */
4385 if ((re_opcode_t) p[-1] != pop_failure_jump)
4386 {
4387 p[-1] = (unsigned char) jump;
4388 DEBUG_PRINT1 (" Match => jump.\n");
4389 goto unconditional_jump;
4390 }
4391 /* Note fall through. */
4392
4393
4394 /* The end of a simple repeat has a pop_failure_jump back to
4395 its matching on_failure_jump, where the latter will push a
4396 failure point. The pop_failure_jump takes off failure
4397 points put on by this pop_failure_jump's matching
4398 on_failure_jump; we got through the pattern to here from the
4399 matching on_failure_jump, so didn't fail. */
4400 case pop_failure_jump:
4401 {
4402 /* We need to pass separate storage for the lowest and
4403 highest registers, even though we don't care about the
4404 actual values. Otherwise, we will restore only one
4405 register from the stack, since lowest will == highest in
4406 `pop_failure_point'. */
4407 unsigned dummy_low_reg, dummy_high_reg;
4408 unsigned char *pdummy;
4409 const char *sdummy;
4410
4411 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4412 POP_FAILURE_POINT (sdummy, pdummy,
4413 dummy_low_reg, dummy_high_reg,
4414 reg_dummy, reg_dummy, reg_info_dummy);
4415 }
4416 /* Note fall through. */
4417
4418
4419 /* Unconditionally jump (without popping any failure points). */
4420 case jump:
4421 unconditional_jump:
4422 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4423 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4424 p += mcnt; /* Do the jump. */
4425 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4426 break;
4427
4428
4429 /* We need this opcode so we can detect where alternatives end
4430 in `group_match_null_string_p' et al. */
4431 case jump_past_alt:
4432 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4433 goto unconditional_jump;
4434
4435
4436 /* Normally, the on_failure_jump pushes a failure point, which
4437 then gets popped at pop_failure_jump. We will end up at
4438 pop_failure_jump, also, and with a pattern of, say, `a+', we
4439 are skipping over the on_failure_jump, so we have to push
4440 something meaningless for pop_failure_jump to pop. */
4441 case dummy_failure_jump:
4442 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4443 /* It doesn't matter what we push for the string here. What
4444 the code at `fail' tests is the value for the pattern. */
4445 PUSH_FAILURE_POINT (0, 0, -2);
4446 goto unconditional_jump;
4447
4448
4449 /* At the end of an alternative, we need to push a dummy failure
4450 point in case we are followed by a `pop_failure_jump', because
4451 we don't want the failure point for the alternative to be
4452 popped. For example, matching `(a|ab)*' against `aab'
4453 requires that we match the `ab' alternative. */
4454 case push_dummy_failure:
4455 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4456 /* See comments just above at `dummy_failure_jump' about the
4457 two zeroes. */
4458 PUSH_FAILURE_POINT (0, 0, -2);
4459 break;
4460
4461 /* Have to succeed matching what follows at least n times.
4462 After that, handle like `on_failure_jump'. */
4463 case succeed_n:
4464 EXTRACT_NUMBER (mcnt, p + 2);
4465 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4466
4467 assert (mcnt >= 0);
4468 /* Originally, this is how many times we HAVE to succeed. */
4469 if (mcnt > 0)
4470 {
4471 mcnt--;
4472 p += 2;
4473 STORE_NUMBER_AND_INCR (p, mcnt);
4474 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4475 }
4476 else if (mcnt == 0)
4477 {
4478 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4479 p[2] = (unsigned char) no_op;
4480 p[3] = (unsigned char) no_op;
4481 goto on_failure;
4482 }
4483 break;
4484
4485 case jump_n:
4486 EXTRACT_NUMBER (mcnt, p + 2);
4487 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4488
4489 /* Originally, this is how many times we CAN jump. */
4490 if (mcnt)
4491 {
4492 mcnt--;
4493 STORE_NUMBER (p + 2, mcnt);
4494 goto unconditional_jump;
4495 }
4496 /* If don't have to jump any more, skip over the rest of command. */
4497 else
4498 p += 4;
4499 break;
4500
4501 case set_number_at:
4502 {
4503 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4504
4505 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4506 p1 = p + mcnt;
4507 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4508 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4509 STORE_NUMBER (p1, mcnt);
4510 break;
4511 }
4512
4513 case wordbound:
4514 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4515 if (AT_WORD_BOUNDARY (d))
4516 break;
4517 goto fail;
4518
4519 case notwordbound:
4520 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4521 if (AT_WORD_BOUNDARY (d))
4522 goto fail;
4523 break;
4524
4525 case wordbeg:
4526 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4527 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4528 break;
4529 goto fail;
4530
4531 case wordend:
4532 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4533 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4534 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4535 break;
4536 goto fail;
4537
4538 #ifdef emacs
4539 case before_dot:
4540 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4541 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4542 goto fail;
4543 break;
4544
4545 case at_dot:
4546 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4547 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4548 goto fail;
4549 break;
4550
4551 case after_dot:
4552 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4553 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4554 goto fail;
4555 break;
4556 #if 0 /* not emacs19 */
4557 case at_dot:
4558 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4559 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4560 goto fail;
4561 break;
4562 #endif /* not emacs19 */
4563
4564 case syntaxspec:
4565 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4566 mcnt = *p++;
4567 goto matchsyntax;
4568
4569 case wordchar:
4570 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4571 mcnt = (int) Sword;
4572 matchsyntax:
4573 PREFETCH ();
4574 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
4575 d++;
4576 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt)
4577 goto fail;
4578 SET_REGS_MATCHED ();
4579 break;
4580
4581 case notsyntaxspec:
4582 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4583 mcnt = *p++;
4584 goto matchnotsyntax;
4585
4586 case notwordchar:
4587 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4588 mcnt = (int) Sword;
4589 matchnotsyntax:
4590 PREFETCH ();
4591 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */
4592 d++;
4593 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt)
4594 goto fail;
4595 SET_REGS_MATCHED ();
4596 break;
4597
4598 #else /* not emacs */
4599 case wordchar:
4600 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4601 PREFETCH ();
4602 if (!WORDCHAR_P (d))
4603 goto fail;
4604 SET_REGS_MATCHED ();
4605 d++;
4606 break;
4607
4608 case notwordchar:
4609 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4610 PREFETCH ();
4611 if (WORDCHAR_P (d))
4612 goto fail;
4613 SET_REGS_MATCHED ();
4614 d++;
4615 break;
4616 #endif /* not emacs */
4617
4618 default:
4619 abort ();
4620 }
4621 continue; /* Successfully executed one pattern command; keep going. */
4622
4623
4624 /* We goto here if a matching operation fails. */
4625 fail:
4626 if (!FAIL_STACK_EMPTY ())
4627 { /* A restart point is known. Restore to that state. */
4628 DEBUG_PRINT1 ("\nFAIL:\n");
4629 POP_FAILURE_POINT (d, p,
4630 lowest_active_reg, highest_active_reg,
4631 regstart, regend, reg_info);
4632
4633 /* If this failure point is a dummy, try the next one. */
4634 if (!p)
4635 goto fail;
4636
4637 /* If we failed to the end of the pattern, don't examine *p. */
4638 assert (p <= pend);
4639 if (p < pend)
4640 {
4641 boolean is_a_jump_n = false;
4642
4643 /* If failed to a backwards jump that's part of a repetition
4644 loop, need to pop this failure point and use the next one. */
4645 switch ((re_opcode_t) *p)
4646 {
4647 case jump_n:
4648 is_a_jump_n = true;
4649 case maybe_pop_jump:
4650 case pop_failure_jump:
4651 case jump:
4652 p1 = p + 1;
4653 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4654 p1 += mcnt;
4655
4656 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4657 || (!is_a_jump_n
4658 && (re_opcode_t) *p1 == on_failure_jump))
4659 goto fail;
4660 break;
4661 default:
4662 /* do nothing */ ;
4663 }
4664 }
4665
4666 if (d >= string1 && d <= end1)
4667 dend = end_match_1;
4668 }
4669 else
4670 break; /* Matching at this starting point really fails. */
4671 } /* for (;;) */
4672
4673 if (best_regs_set)
4674 goto restore_best_regs;
4675
4676 FREE_VARIABLES ();
4677
4678 return -1; /* Failure to match. */
4679 } /* re_match_2 */
4680 \f
4681 /* Subroutine definitions for re_match_2. */
4682
4683
4684 /* We are passed P pointing to a register number after a start_memory.
4685
4686 Return true if the pattern up to the corresponding stop_memory can
4687 match the empty string, and false otherwise.
4688
4689 If we find the matching stop_memory, sets P to point to one past its number.
4690 Otherwise, sets P to an undefined byte less than or equal to END.
4691
4692 We don't handle duplicates properly (yet). */
4693
4694 static boolean
4695 group_match_null_string_p (p, end, reg_info)
4696 unsigned char **p, *end;
4697 register_info_type *reg_info;
4698 {
4699 int mcnt;
4700 /* Point to after the args to the start_memory. */
4701 unsigned char *p1 = *p + 2;
4702
4703 while (p1 < end)
4704 {
4705 /* Skip over opcodes that can match nothing, and return true or
4706 false, as appropriate, when we get to one that can't, or to the
4707 matching stop_memory. */
4708
4709 switch ((re_opcode_t) *p1)
4710 {
4711 /* Could be either a loop or a series of alternatives. */
4712 case on_failure_jump:
4713 p1++;
4714 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4715
4716 /* If the next operation is not a jump backwards in the
4717 pattern. */
4718
4719 if (mcnt >= 0)
4720 {
4721 /* Go through the on_failure_jumps of the alternatives,
4722 seeing if any of the alternatives cannot match nothing.
4723 The last alternative starts with only a jump,
4724 whereas the rest start with on_failure_jump and end
4725 with a jump, e.g., here is the pattern for `a|b|c':
4726
4727 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4728 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4729 /exactn/1/c
4730
4731 So, we have to first go through the first (n-1)
4732 alternatives and then deal with the last one separately. */
4733
4734
4735 /* Deal with the first (n-1) alternatives, which start
4736 with an on_failure_jump (see above) that jumps to right
4737 past a jump_past_alt. */
4738
4739 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4740 {
4741 /* `mcnt' holds how many bytes long the alternative
4742 is, including the ending `jump_past_alt' and
4743 its number. */
4744
4745 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4746 reg_info))
4747 return false;
4748
4749 /* Move to right after this alternative, including the
4750 jump_past_alt. */
4751 p1 += mcnt;
4752
4753 /* Break if it's the beginning of an n-th alternative
4754 that doesn't begin with an on_failure_jump. */
4755 if ((re_opcode_t) *p1 != on_failure_jump)
4756 break;
4757
4758 /* Still have to check that it's not an n-th
4759 alternative that starts with an on_failure_jump. */
4760 p1++;
4761 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4762 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4763 {
4764 /* Get to the beginning of the n-th alternative. */
4765 p1 -= 3;
4766 break;
4767 }
4768 }
4769
4770 /* Deal with the last alternative: go back and get number
4771 of the `jump_past_alt' just before it. `mcnt' contains
4772 the length of the alternative. */
4773 EXTRACT_NUMBER (mcnt, p1 - 2);
4774
4775 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4776 return false;
4777
4778 p1 += mcnt; /* Get past the n-th alternative. */
4779 } /* if mcnt > 0 */
4780 break;
4781
4782
4783 case stop_memory:
4784 assert (p1[1] == **p);
4785 *p = p1 + 2;
4786 return true;
4787
4788
4789 default:
4790 if (!common_op_match_null_string_p (&p1, end, reg_info))
4791 return false;
4792 }
4793 } /* while p1 < end */
4794
4795 return false;
4796 } /* group_match_null_string_p */
4797
4798
4799 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4800 It expects P to be the first byte of a single alternative and END one
4801 byte past the last. The alternative can contain groups. */
4802
4803 static boolean
4804 alt_match_null_string_p (p, end, reg_info)
4805 unsigned char *p, *end;
4806 register_info_type *reg_info;
4807 {
4808 int mcnt;
4809 unsigned char *p1 = p;
4810
4811 while (p1 < end)
4812 {
4813 /* Skip over opcodes that can match nothing, and break when we get
4814 to one that can't. */
4815
4816 switch ((re_opcode_t) *p1)
4817 {
4818 /* It's a loop. */
4819 case on_failure_jump:
4820 p1++;
4821 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4822 p1 += mcnt;
4823 break;
4824
4825 default:
4826 if (!common_op_match_null_string_p (&p1, end, reg_info))
4827 return false;
4828 }
4829 } /* while p1 < end */
4830
4831 return true;
4832 } /* alt_match_null_string_p */
4833
4834
4835 /* Deals with the ops common to group_match_null_string_p and
4836 alt_match_null_string_p.
4837
4838 Sets P to one after the op and its arguments, if any. */
4839
4840 static boolean
4841 common_op_match_null_string_p (p, end, reg_info)
4842 unsigned char **p, *end;
4843 register_info_type *reg_info;
4844 {
4845 int mcnt;
4846 boolean ret;
4847 int reg_no;
4848 unsigned char *p1 = *p;
4849
4850 switch ((re_opcode_t) *p1++)
4851 {
4852 case no_op:
4853 case begline:
4854 case endline:
4855 case begbuf:
4856 case endbuf:
4857 case wordbeg:
4858 case wordend:
4859 case wordbound:
4860 case notwordbound:
4861 #ifdef emacs
4862 case before_dot:
4863 case at_dot:
4864 case after_dot:
4865 #endif
4866 break;
4867
4868 case start_memory:
4869 reg_no = *p1;
4870 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4871 ret = group_match_null_string_p (&p1, end, reg_info);
4872
4873 /* Have to set this here in case we're checking a group which
4874 contains a group and a back reference to it. */
4875
4876 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4877 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4878
4879 if (!ret)
4880 return false;
4881 break;
4882
4883 /* If this is an optimized succeed_n for zero times, make the jump. */
4884 case jump:
4885 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4886 if (mcnt >= 0)
4887 p1 += mcnt;
4888 else
4889 return false;
4890 break;
4891
4892 case succeed_n:
4893 /* Get to the number of times to succeed. */
4894 p1 += 2;
4895 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4896
4897 if (mcnt == 0)
4898 {
4899 p1 -= 4;
4900 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4901 p1 += mcnt;
4902 }
4903 else
4904 return false;
4905 break;
4906
4907 case duplicate:
4908 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4909 return false;
4910 break;
4911
4912 case set_number_at:
4913 p1 += 4;
4914
4915 default:
4916 /* All other opcodes mean we cannot match the empty string. */
4917 return false;
4918 }
4919
4920 *p = p1;
4921 return true;
4922 } /* common_op_match_null_string_p */
4923
4924
4925 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4926 bytes; nonzero otherwise. */
4927
4928 static int
4929 bcmp_translate (s1, s2, len, translate)
4930 unsigned char *s1, *s2;
4931 register int len;
4932 char *translate;
4933 {
4934 register unsigned char *p1 = s1, *p2 = s2;
4935 while (len)
4936 {
4937 if (translate[*p1++] != translate[*p2++]) return 1;
4938 len--;
4939 }
4940 return 0;
4941 }
4942 \f
4943 /* Entry points for GNU code. */
4944
4945 /* re_compile_pattern is the GNU regular expression compiler: it
4946 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4947 Returns 0 if the pattern was valid, otherwise an error string.
4948
4949 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4950 are set in BUFP on entry.
4951
4952 We call regex_compile to do the actual compilation. */
4953
4954 const char *
4955 re_compile_pattern (pattern, length, bufp)
4956 const char *pattern;
4957 int length;
4958 struct re_pattern_buffer *bufp;
4959 {
4960 reg_errcode_t ret;
4961
4962 /* GNU code is written to assume at least RE_NREGS registers will be set
4963 (and at least one extra will be -1). */
4964 bufp->regs_allocated = REGS_UNALLOCATED;
4965
4966 /* And GNU code determines whether or not to get register information
4967 by passing null for the REGS argument to re_match, etc., not by
4968 setting no_sub. */
4969 bufp->no_sub = 0;
4970
4971 /* Match anchors at newline. */
4972 bufp->newline_anchor = 1;
4973
4974 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4975
4976 if (!ret)
4977 return NULL;
4978 return gettext (re_error_msgid[(int) ret]);
4979 }
4980 \f
4981 /* Entry points compatible with 4.2 BSD regex library. We don't define
4982 them unless specifically requested. */
4983
4984 #ifdef _REGEX_RE_COMP
4985
4986 /* BSD has one and only one pattern buffer. */
4987 static struct re_pattern_buffer re_comp_buf;
4988
4989 char *
4990 re_comp (s)
4991 const char *s;
4992 {
4993 reg_errcode_t ret;
4994
4995 if (!s)
4996 {
4997 if (!re_comp_buf.buffer)
4998 return gettext ("No previous regular expression");
4999 return 0;
5000 }
5001
5002 if (!re_comp_buf.buffer)
5003 {
5004 re_comp_buf.buffer = (unsigned char *) malloc (200);
5005 if (re_comp_buf.buffer == NULL)
5006 return gettext (re_error_msgid[(int) REG_ESPACE]);
5007 re_comp_buf.allocated = 200;
5008
5009 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5010 if (re_comp_buf.fastmap == NULL)
5011 return gettext (re_error_msgid[(int) REG_ESPACE]);
5012 }
5013
5014 /* Since `re_exec' always passes NULL for the `regs' argument, we
5015 don't need to initialize the pattern buffer fields which affect it. */
5016
5017 /* Match anchors at newlines. */
5018 re_comp_buf.newline_anchor = 1;
5019
5020 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5021
5022 if (!ret)
5023 return NULL;
5024
5025 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5026 return (char *) gettext (re_error_msgid[(int) ret]);
5027 }
5028
5029
5030 int
5031 re_exec (s)
5032 const char *s;
5033 {
5034 const int len = strlen (s);
5035 return
5036 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5037 }
5038 #endif /* _REGEX_RE_COMP */
5039 \f
5040 /* POSIX.2 functions. Don't define these for Emacs. */
5041
5042 #ifndef emacs
5043
5044 /* regcomp takes a regular expression as a string and compiles it.
5045
5046 PREG is a regex_t *. We do not expect any fields to be initialized,
5047 since POSIX says we shouldn't. Thus, we set
5048
5049 `buffer' to the compiled pattern;
5050 `used' to the length of the compiled pattern;
5051 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5052 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5053 RE_SYNTAX_POSIX_BASIC;
5054 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5055 `fastmap' and `fastmap_accurate' to zero;
5056 `re_nsub' to the number of subexpressions in PATTERN.
5057
5058 PATTERN is the address of the pattern string.
5059
5060 CFLAGS is a series of bits which affect compilation.
5061
5062 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5063 use POSIX basic syntax.
5064
5065 If REG_NEWLINE is set, then . and [^...] don't match newline.
5066 Also, regexec will try a match beginning after every newline.
5067
5068 If REG_ICASE is set, then we considers upper- and lowercase
5069 versions of letters to be equivalent when matching.
5070
5071 If REG_NOSUB is set, then when PREG is passed to regexec, that
5072 routine will report only success or failure, and nothing about the
5073 registers.
5074
5075 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5076 the return codes and their meanings.) */
5077
5078 int
5079 regcomp (preg, pattern, cflags)
5080 regex_t *preg;
5081 const char *pattern;
5082 int cflags;
5083 {
5084 reg_errcode_t ret;
5085 unsigned syntax
5086 = (cflags & REG_EXTENDED) ?
5087 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5088
5089 /* regex_compile will allocate the space for the compiled pattern. */
5090 preg->buffer = 0;
5091 preg->allocated = 0;
5092 preg->used = 0;
5093
5094 /* Don't bother to use a fastmap when searching. This simplifies the
5095 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5096 characters after newlines into the fastmap. This way, we just try
5097 every character. */
5098 preg->fastmap = 0;
5099
5100 if (cflags & REG_ICASE)
5101 {
5102 unsigned i;
5103
5104 preg->translate = (char *) malloc (CHAR_SET_SIZE);
5105 if (preg->translate == NULL)
5106 return (int) REG_ESPACE;
5107
5108 /* Map uppercase characters to corresponding lowercase ones. */
5109 for (i = 0; i < CHAR_SET_SIZE; i++)
5110 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
5111 }
5112 else
5113 preg->translate = NULL;
5114
5115 /* If REG_NEWLINE is set, newlines are treated differently. */
5116 if (cflags & REG_NEWLINE)
5117 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5118 syntax &= ~RE_DOT_NEWLINE;
5119 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5120 /* It also changes the matching behavior. */
5121 preg->newline_anchor = 1;
5122 }
5123 else
5124 preg->newline_anchor = 0;
5125
5126 preg->no_sub = !!(cflags & REG_NOSUB);
5127
5128 /* POSIX says a null character in the pattern terminates it, so we
5129 can use strlen here in compiling the pattern. */
5130 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5131
5132 /* POSIX doesn't distinguish between an unmatched open-group and an
5133 unmatched close-group: both are REG_EPAREN. */
5134 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5135
5136 return (int) ret;
5137 }
5138
5139
5140 /* regexec searches for a given pattern, specified by PREG, in the
5141 string STRING.
5142
5143 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5144 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5145 least NMATCH elements, and we set them to the offsets of the
5146 corresponding matched substrings.
5147
5148 EFLAGS specifies `execution flags' which affect matching: if
5149 REG_NOTBOL is set, then ^ does not match at the beginning of the
5150 string; if REG_NOTEOL is set, then $ does not match at the end.
5151
5152 We return 0 if we find a match and REG_NOMATCH if not. */
5153
5154 int
5155 regexec (preg, string, nmatch, pmatch, eflags)
5156 const regex_t *preg;
5157 const char *string;
5158 size_t nmatch;
5159 regmatch_t pmatch[];
5160 int eflags;
5161 {
5162 int ret;
5163 struct re_registers regs;
5164 regex_t private_preg;
5165 int len = strlen (string);
5166 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5167
5168 private_preg = *preg;
5169
5170 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5171 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5172
5173 /* The user has told us exactly how many registers to return
5174 information about, via `nmatch'. We have to pass that on to the
5175 matching routines. */
5176 private_preg.regs_allocated = REGS_FIXED;
5177
5178 if (want_reg_info)
5179 {
5180 regs.num_regs = nmatch;
5181 regs.start = TALLOC (nmatch, regoff_t);
5182 regs.end = TALLOC (nmatch, regoff_t);
5183 if (regs.start == NULL || regs.end == NULL)
5184 return (int) REG_NOMATCH;
5185 }
5186
5187 /* Perform the searching operation. */
5188 ret = re_search (&private_preg, string, len,
5189 /* start: */ 0, /* range: */ len,
5190 want_reg_info ? &regs : (struct re_registers *) 0);
5191
5192 /* Copy the register information to the POSIX structure. */
5193 if (want_reg_info)
5194 {
5195 if (ret >= 0)
5196 {
5197 unsigned r;
5198
5199 for (r = 0; r < nmatch; r++)
5200 {
5201 pmatch[r].rm_so = regs.start[r];
5202 pmatch[r].rm_eo = regs.end[r];
5203 }
5204 }
5205
5206 /* If we needed the temporary register info, free the space now. */
5207 free (regs.start);
5208 free (regs.end);
5209 }
5210
5211 /* We want zero return to mean success, unlike `re_search'. */
5212 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5213 }
5214
5215
5216 /* Returns a message corresponding to an error code, ERRCODE, returned
5217 from either regcomp or regexec. We don't use PREG here. */
5218
5219 size_t
5220 regerror (errcode, preg, errbuf, errbuf_size)
5221 int errcode;
5222 const regex_t *preg;
5223 char *errbuf;
5224 size_t errbuf_size;
5225 {
5226 const char *msg;
5227 size_t msg_size;
5228
5229 if (errcode < 0
5230 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5231 /* Only error codes returned by the rest of the code should be passed
5232 to this routine. If we are given anything else, or if other regex
5233 code generates an invalid error code, then the program has a bug.
5234 Dump core so we can fix it. */
5235 abort ();
5236
5237 msg = gettext (re_error_msgid[errcode]);
5238
5239 msg_size = strlen (msg) + 1; /* Includes the null. */
5240
5241 if (errbuf_size != 0)
5242 {
5243 if (msg_size > errbuf_size)
5244 {
5245 strncpy (errbuf, msg, errbuf_size - 1);
5246 errbuf[errbuf_size - 1] = 0;
5247 }
5248 else
5249 strcpy (errbuf, msg);
5250 }
5251
5252 return msg_size;
5253 }
5254
5255
5256 /* Free dynamically allocated space used by PREG. */
5257
5258 void
5259 regfree (preg)
5260 regex_t *preg;
5261 {
5262 if (preg->buffer != NULL)
5263 free (preg->buffer);
5264 preg->buffer = NULL;
5265
5266 preg->allocated = 0;
5267 preg->used = 0;
5268
5269 if (preg->fastmap != NULL)
5270 free (preg->fastmap);
5271 preg->fastmap = NULL;
5272 preg->fastmap_accurate = 0;
5273
5274 if (preg->translate != NULL)
5275 free (preg->translate);
5276 preg->translate = NULL;
5277 }
5278
5279 #endif /* not emacs */
5280 \f
5281 /*
5282 Local variables:
5283 make-backup-files: t
5284 version-control: t
5285 trim-versions-without-asking: nil
5286 End:
5287 */