]> code.delx.au - gnu-emacs/blob - src/regex.c
(regex_compile) <normal_char>: Pay attention to multibyteness.
[gnu-emacs] / src / regex.c
1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P10003.2/D11.2, except for
3 internationalization features.)
4
5 Copyright (C) 1993,94,95,96,97,98,2000 Free Software Foundation, Inc.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA. */
21
22 /* TODO:
23 - structure the opcode space into opcode+flag.
24 - merge with glibc's regex.[ch]
25 */
26
27 /* AIX requires this to be the first thing in the file. */
28 #if defined (_AIX) && !defined (REGEX_MALLOC)
29 #pragma alloca
30 #endif
31
32 #undef _GNU_SOURCE
33 #define _GNU_SOURCE
34
35 #ifdef emacs
36 /* Converts the pointer to the char to BEG-based offset from the start. */
37 #define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
38 #define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
39 #endif
40
41 #ifdef HAVE_CONFIG_H
42 #include <config.h>
43 #endif
44
45 /* We need this for `regex.h', and perhaps for the Emacs include files. */
46 #include <sys/types.h>
47
48 /* This is for other GNU distributions with internationalized messages. */
49 #if HAVE_LIBINTL_H || defined (_LIBC)
50 # include <libintl.h>
51 #else
52 # define gettext(msgid) (msgid)
53 #endif
54
55 #ifndef gettext_noop
56 /* This define is so xgettext can find the internationalizable
57 strings. */
58 #define gettext_noop(String) String
59 #endif
60
61 /* The `emacs' switch turns on certain matching commands
62 that make sense only in Emacs. */
63 #ifdef emacs
64
65 #include "lisp.h"
66 #include "buffer.h"
67
68 /* Make syntax table lookup grant data in gl_state. */
69 #define SYNTAX_ENTRY_VIA_PROPERTY
70
71 #include "syntax.h"
72 #include "charset.h"
73 #include "category.h"
74
75 #define malloc xmalloc
76 #define realloc xrealloc
77 #define free xfree
78
79 #define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
80 #define RE_STRING_CHAR(p, s) \
81 (multibyte ? (STRING_CHAR (p, s)) : (*(p)))
82 #define RE_STRING_CHAR_AND_LENGTH(p, s, len) \
83 (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p)))
84
85 /* Set C a (possibly multibyte) character before P. P points into a
86 string which is the virtual concatenation of STR1 (which ends at
87 END1) or STR2 (which ends at END2). */
88 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
89 do { \
90 if (multibyte) \
91 { \
92 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
93 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
94 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
95 c = STRING_CHAR (dtemp, (p) - dtemp); \
96 } \
97 else \
98 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
99 } while (0)
100
101
102 #else /* not emacs */
103
104 /* If we are not linking with Emacs proper,
105 we can't use the relocating allocator
106 even if config.h says that we can. */
107 #undef REL_ALLOC
108
109 #if defined (STDC_HEADERS) || defined (_LIBC)
110 #include <stdlib.h>
111 #else
112 char *malloc ();
113 char *realloc ();
114 #endif
115
116 /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
117 If nothing else has been done, use the method below. */
118 #ifdef INHIBIT_STRING_HEADER
119 #if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
120 #if !defined (bzero) && !defined (bcopy)
121 #undef INHIBIT_STRING_HEADER
122 #endif
123 #endif
124 #endif
125
126 /* This is the normal way of making sure we have a bcopy and a bzero.
127 This is used in most programs--a few other programs avoid this
128 by defining INHIBIT_STRING_HEADER. */
129 #ifndef INHIBIT_STRING_HEADER
130 #if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
131 #include <string.h>
132 #ifndef bcmp
133 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
134 #endif
135 #ifndef bcopy
136 #define bcopy(s, d, n) memcpy ((d), (s), (n))
137 #endif
138 #ifndef bzero
139 #define bzero(s, n) memset ((s), 0, (n))
140 #endif
141 #else
142 #include <strings.h>
143 #endif
144 #endif
145
146 /* Define the syntax stuff for \<, \>, etc. */
147
148 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
149 enum syntaxcode { Swhitespace = 0, Sword = 1 };
150
151 #ifdef SWITCH_ENUM_BUG
152 #define SWITCH_ENUM_CAST(x) ((int)(x))
153 #else
154 #define SWITCH_ENUM_CAST(x) (x)
155 #endif
156
157 #ifdef SYNTAX_TABLE
158
159 extern char *re_syntax_table;
160
161 #else /* not SYNTAX_TABLE */
162
163 /* How many characters in the character set. */
164 #define CHAR_SET_SIZE 256
165
166 static char re_syntax_table[CHAR_SET_SIZE];
167
168 static void
169 init_syntax_once ()
170 {
171 register int c;
172 static int done = 0;
173
174 if (done)
175 return;
176
177 bzero (re_syntax_table, sizeof re_syntax_table);
178
179 for (c = 'a'; c <= 'z'; c++)
180 re_syntax_table[c] = Sword;
181
182 for (c = 'A'; c <= 'Z'; c++)
183 re_syntax_table[c] = Sword;
184
185 for (c = '0'; c <= '9'; c++)
186 re_syntax_table[c] = Sword;
187
188 re_syntax_table['_'] = Sword;
189
190 done = 1;
191 }
192
193 #endif /* not SYNTAX_TABLE */
194
195 #define SYNTAX(c) re_syntax_table[c]
196
197 /* Dummy macros for non-Emacs environments. */
198 #define BASE_LEADING_CODE_P(c) (0)
199 #define CHAR_CHARSET(c) 0
200 #define CHARSET_LEADING_CODE_BASE(c) 0
201 #define MAX_MULTIBYTE_LENGTH 1
202 #define RE_MULTIBYTE_P(x) 0
203 #define WORD_BOUNDARY_P(c1, c2) (0)
204 #define CHAR_HEAD_P(p) (1)
205 #define SINGLE_BYTE_CHAR_P(c) (1)
206 #define SAME_CHARSET_P(c1, c2) (1)
207 #define MULTIBYTE_FORM_LENGTH(p, s) (1)
208 #define STRING_CHAR(p, s) (*(p))
209 #define RE_STRING_CHAR STRING_CHAR
210 #define CHAR_STRING(c, s) (*(s) = (c), 1)
211 #define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p))
212 #define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH
213 #define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
214 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
215 #define MAKE_CHAR(charset, c1, c2) (c1)
216 #endif /* not emacs */
217
218 #ifndef RE_TRANSLATE
219 #define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
220 #define RE_TRANSLATE_P(TBL) (TBL)
221 #endif
222 \f
223 /* Get the interface, including the syntax bits. */
224 #include "regex.h"
225
226 /* isalpha etc. are used for the character classes. */
227 #include <ctype.h>
228
229 #ifdef emacs
230
231 /* 1 if C is an ASCII character. */
232 #define IS_REAL_ASCII(c) ((c) < 0200)
233
234 /* 1 if C is a unibyte character. */
235 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
236
237 /* The Emacs definitions should not be directly affected by locales. */
238
239 /* In Emacs, these are only used for single-byte characters. */
240 #define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
241 #define ISCNTRL(c) ((c) < ' ')
242 #define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
243 || ((c) >= 'a' && (c) <= 'f') \
244 || ((c) >= 'A' && (c) <= 'F'))
245
246 /* This is only used for single-byte characters. */
247 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
248
249 /* The rest must handle multibyte characters. */
250
251 #define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
252 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
253 : 1)
254
255 #define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
256 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
257 : 1)
258
259 #define ISALNUM(c) (IS_REAL_ASCII (c) \
260 ? (((c) >= 'a' && (c) <= 'z') \
261 || ((c) >= 'A' && (c) <= 'Z') \
262 || ((c) >= '0' && (c) <= '9')) \
263 : SYNTAX (c) == Sword)
264
265 #define ISALPHA(c) (IS_REAL_ASCII (c) \
266 ? (((c) >= 'a' && (c) <= 'z') \
267 || ((c) >= 'A' && (c) <= 'Z')) \
268 : SYNTAX (c) == Sword)
269
270 #define ISLOWER(c) (LOWERCASEP (c))
271
272 #define ISPUNCT(c) (IS_REAL_ASCII (c) \
273 ? ((c) > ' ' && (c) < 0177 \
274 && !(((c) >= 'a' && (c) <= 'z') \
275 || ((c) >= 'A' && (c) <= 'Z') \
276 || ((c) >= '0' && (c) <= '9'))) \
277 : SYNTAX (c) != Sword)
278
279 #define ISSPACE(c) (SYNTAX (c) == Swhitespace)
280
281 #define ISUPPER(c) (UPPERCASEP (c))
282
283 #define ISWORD(c) (SYNTAX (c) == Sword)
284
285 #else /* not emacs */
286
287 /* Jim Meyering writes:
288
289 "... Some ctype macros are valid only for character codes that
290 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
291 using /bin/cc or gcc but without giving an ansi option). So, all
292 ctype uses should be through macros like ISPRINT... If
293 STDC_HEADERS is defined, then autoconf has verified that the ctype
294 macros don't need to be guarded with references to isascii. ...
295 Defining isascii to 1 should let any compiler worth its salt
296 eliminate the && through constant folding." */
297
298 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
299 #define ISASCII(c) 1
300 #else
301 #define ISASCII(c) isascii(c)
302 #endif
303
304 /* 1 if C is an ASCII character. */
305 #define IS_REAL_ASCII(c) ((c) < 0200)
306
307 /* This distinction is not meaningful, except in Emacs. */
308 #define ISUNIBYTE(c) 1
309
310 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
311 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
312 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
313
314 #ifdef isblank
315 #define ISBLANK(c) (ISASCII (c) && isblank (c))
316 #else
317 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
318 #endif
319 #ifdef isgraph
320 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
321 #else
322 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
323 #endif
324
325 #define ISPRINT(c) (ISASCII (c) && isprint (c))
326 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
327 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
328 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
329 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
330 #define ISLOWER(c) (ISASCII (c) && islower (c))
331 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
332 #define ISSPACE(c) (ISASCII (c) && isspace (c))
333 #define ISUPPER(c) (ISASCII (c) && isupper (c))
334 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
335
336 #define ISWORD(c) ISALPHA(c)
337
338 #endif /* not emacs */
339 \f
340 #ifndef NULL
341 #define NULL (void *)0
342 #endif
343
344 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
345 since ours (we hope) works properly with all combinations of
346 machines, compilers, `char' and `unsigned char' argument types.
347 (Per Bothner suggested the basic approach.) */
348 #undef SIGN_EXTEND_CHAR
349 #if __STDC__
350 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
351 #else /* not __STDC__ */
352 /* As in Harbison and Steele. */
353 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
354 #endif
355 \f
356 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
357 use `alloca' instead of `malloc'. This is because using malloc in
358 re_search* or re_match* could cause memory leaks when C-g is used in
359 Emacs; also, malloc is slower and causes storage fragmentation. On
360 the other hand, malloc is more portable, and easier to debug.
361
362 Because we sometimes use alloca, some routines have to be macros,
363 not functions -- `alloca'-allocated space disappears at the end of the
364 function it is called in. */
365
366 #ifdef REGEX_MALLOC
367
368 #define REGEX_ALLOCATE malloc
369 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
370 #define REGEX_FREE free
371
372 #else /* not REGEX_MALLOC */
373
374 /* Emacs already defines alloca, sometimes. */
375 #ifndef alloca
376
377 /* Make alloca work the best possible way. */
378 #ifdef __GNUC__
379 #define alloca __builtin_alloca
380 #else /* not __GNUC__ */
381 #if HAVE_ALLOCA_H
382 #include <alloca.h>
383 #else /* not __GNUC__ or HAVE_ALLOCA_H */
384 #if 0 /* It is a bad idea to declare alloca. We always cast the result. */
385 #ifndef _AIX /* Already did AIX, up at the top. */
386 char *alloca ();
387 #endif /* not _AIX */
388 #endif
389 #endif /* not HAVE_ALLOCA_H */
390 #endif /* not __GNUC__ */
391
392 #endif /* not alloca */
393
394 #define REGEX_ALLOCATE alloca
395
396 /* Assumes a `char *destination' variable. */
397 #define REGEX_REALLOCATE(source, osize, nsize) \
398 (destination = (char *) alloca (nsize), \
399 bcopy (source, destination, osize), \
400 destination)
401
402 /* No need to do anything to free, after alloca. */
403 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
404
405 #endif /* not REGEX_MALLOC */
406
407 /* Define how to allocate the failure stack. */
408
409 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
410
411 #define REGEX_ALLOCATE_STACK(size) \
412 r_alloc (&failure_stack_ptr, (size))
413 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
414 r_re_alloc (&failure_stack_ptr, (nsize))
415 #define REGEX_FREE_STACK(ptr) \
416 r_alloc_free (&failure_stack_ptr)
417
418 #else /* not using relocating allocator */
419
420 #ifdef REGEX_MALLOC
421
422 #define REGEX_ALLOCATE_STACK malloc
423 #define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
424 #define REGEX_FREE_STACK free
425
426 #else /* not REGEX_MALLOC */
427
428 #define REGEX_ALLOCATE_STACK alloca
429
430 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
431 REGEX_REALLOCATE (source, osize, nsize)
432 /* No need to explicitly free anything. */
433 #define REGEX_FREE_STACK(arg) ((void)0)
434
435 #endif /* not REGEX_MALLOC */
436 #endif /* not using relocating allocator */
437
438
439 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
440 `string1' or just past its end. This works if PTR is NULL, which is
441 a good thing. */
442 #define FIRST_STRING_P(ptr) \
443 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
444
445 /* (Re)Allocate N items of type T using malloc, or fail. */
446 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
447 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
448 #define RETALLOC_IF(addr, n, t) \
449 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
450 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
451
452 #define BYTEWIDTH 8 /* In bits. */
453
454 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
455
456 #undef MAX
457 #undef MIN
458 #define MAX(a, b) ((a) > (b) ? (a) : (b))
459 #define MIN(a, b) ((a) < (b) ? (a) : (b))
460
461 /* Type of source-pattern and string chars. */
462 typedef const unsigned char re_char;
463
464 typedef char boolean;
465 #define false 0
466 #define true 1
467
468 static int re_match_2_internal ();
469 \f
470 /* These are the command codes that appear in compiled regular
471 expressions. Some opcodes are followed by argument bytes. A
472 command code can specify any interpretation whatsoever for its
473 arguments. Zero bytes may appear in the compiled regular expression. */
474
475 typedef enum
476 {
477 no_op = 0,
478
479 /* Succeed right away--no more backtracking. */
480 succeed,
481
482 /* Followed by one byte giving n, then by n literal bytes. */
483 exactn,
484
485 /* Matches any (more or less) character. */
486 anychar,
487
488 /* Matches any one char belonging to specified set. First
489 following byte is number of bitmap bytes. Then come bytes
490 for a bitmap saying which chars are in. Bits in each byte
491 are ordered low-bit-first. A character is in the set if its
492 bit is 1. A character too large to have a bit in the map is
493 automatically not in the set.
494
495 If the length byte has the 0x80 bit set, then that stuff
496 is followed by a range table:
497 2 bytes of flags for character sets (low 8 bits, high 8 bits)
498 See RANGE_TABLE_WORK_BITS below.
499 2 bytes, the number of pairs that follow
500 pairs, each 2 multibyte characters,
501 each multibyte character represented as 3 bytes. */
502 charset,
503
504 /* Same parameters as charset, but match any character that is
505 not one of those specified. */
506 charset_not,
507
508 /* Start remembering the text that is matched, for storing in a
509 register. Followed by one byte with the register number, in
510 the range 0 to one less than the pattern buffer's re_nsub
511 field. */
512 start_memory,
513
514 /* Stop remembering the text that is matched and store it in a
515 memory register. Followed by one byte with the register
516 number, in the range 0 to one less than `re_nsub' in the
517 pattern buffer. */
518 stop_memory,
519
520 /* Match a duplicate of something remembered. Followed by one
521 byte containing the register number. */
522 duplicate,
523
524 /* Fail unless at beginning of line. */
525 begline,
526
527 /* Fail unless at end of line. */
528 endline,
529
530 /* Succeeds if at beginning of buffer (if emacs) or at beginning
531 of string to be matched (if not). */
532 begbuf,
533
534 /* Analogously, for end of buffer/string. */
535 endbuf,
536
537 /* Followed by two byte relative address to which to jump. */
538 jump,
539
540 /* Followed by two-byte relative address of place to resume at
541 in case of failure. */
542 on_failure_jump,
543
544 /* Like on_failure_jump, but pushes a placeholder instead of the
545 current string position when executed. */
546 on_failure_keep_string_jump,
547
548 /* Just like `on_failure_jump', except that it checks that we
549 don't get stuck in an infinite loop (matching an empty string
550 indefinitely). */
551 on_failure_jump_loop,
552
553 /* Just like `on_failure_jump_loop', except that it checks for
554 a different kind of loop (the kind that shows up with non-greedy
555 operators). This operation has to be immediately preceded
556 by a `no_op'. */
557 on_failure_jump_nastyloop,
558
559 /* A smart `on_failure_jump' used for greedy * and + operators.
560 It analyses the loop before which it is put and if the
561 loop does not require backtracking, it changes itself to
562 `on_failure_keep_string_jump' and short-circuits the loop,
563 else it just defaults to changing itself into `on_failure_jump'.
564 It assumes that it is pointing to just past a `jump'. */
565 on_failure_jump_smart,
566
567 /* Followed by two-byte relative address and two-byte number n.
568 After matching N times, jump to the address upon failure.
569 Does not work if N starts at 0: use on_failure_jump_loop
570 instead. */
571 succeed_n,
572
573 /* Followed by two-byte relative address, and two-byte number n.
574 Jump to the address N times, then fail. */
575 jump_n,
576
577 /* Set the following two-byte relative address to the
578 subsequent two-byte number. The address *includes* the two
579 bytes of number. */
580 set_number_at,
581
582 wordbeg, /* Succeeds if at word beginning. */
583 wordend, /* Succeeds if at word end. */
584
585 wordbound, /* Succeeds if at a word boundary. */
586 notwordbound, /* Succeeds if not at a word boundary. */
587
588 /* Matches any character whose syntax is specified. Followed by
589 a byte which contains a syntax code, e.g., Sword. */
590 syntaxspec,
591
592 /* Matches any character whose syntax is not that specified. */
593 notsyntaxspec
594
595 #ifdef emacs
596 ,before_dot, /* Succeeds if before point. */
597 at_dot, /* Succeeds if at point. */
598 after_dot, /* Succeeds if after point. */
599
600 /* Matches any character whose category-set contains the specified
601 category. The operator is followed by a byte which contains a
602 category code (mnemonic ASCII character). */
603 categoryspec,
604
605 /* Matches any character whose category-set does not contain the
606 specified category. The operator is followed by a byte which
607 contains the category code (mnemonic ASCII character). */
608 notcategoryspec
609 #endif /* emacs */
610 } re_opcode_t;
611 \f
612 /* Common operations on the compiled pattern. */
613
614 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
615
616 #define STORE_NUMBER(destination, number) \
617 do { \
618 (destination)[0] = (number) & 0377; \
619 (destination)[1] = (number) >> 8; \
620 } while (0)
621
622 /* Same as STORE_NUMBER, except increment DESTINATION to
623 the byte after where the number is stored. Therefore, DESTINATION
624 must be an lvalue. */
625
626 #define STORE_NUMBER_AND_INCR(destination, number) \
627 do { \
628 STORE_NUMBER (destination, number); \
629 (destination) += 2; \
630 } while (0)
631
632 /* Put into DESTINATION a number stored in two contiguous bytes starting
633 at SOURCE. */
634
635 #define EXTRACT_NUMBER(destination, source) \
636 do { \
637 (destination) = *(source) & 0377; \
638 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
639 } while (0)
640
641 #ifdef DEBUG
642 static void
643 extract_number (dest, source)
644 int *dest;
645 unsigned char *source;
646 {
647 int temp = SIGN_EXTEND_CHAR (*(source + 1));
648 *dest = *source & 0377;
649 *dest += temp << 8;
650 }
651
652 #ifndef EXTRACT_MACROS /* To debug the macros. */
653 #undef EXTRACT_NUMBER
654 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
655 #endif /* not EXTRACT_MACROS */
656
657 #endif /* DEBUG */
658
659 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
660 SOURCE must be an lvalue. */
661
662 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
663 do { \
664 EXTRACT_NUMBER (destination, source); \
665 (source) += 2; \
666 } while (0)
667
668 #ifdef DEBUG
669 static void
670 extract_number_and_incr (destination, source)
671 int *destination;
672 unsigned char **source;
673 {
674 extract_number (destination, *source);
675 *source += 2;
676 }
677
678 #ifndef EXTRACT_MACROS
679 #undef EXTRACT_NUMBER_AND_INCR
680 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
681 extract_number_and_incr (&dest, &src)
682 #endif /* not EXTRACT_MACROS */
683
684 #endif /* DEBUG */
685 \f
686 /* Store a multibyte character in three contiguous bytes starting
687 DESTINATION, and increment DESTINATION to the byte after where the
688 character is stored. Therefore, DESTINATION must be an lvalue. */
689
690 #define STORE_CHARACTER_AND_INCR(destination, character) \
691 do { \
692 (destination)[0] = (character) & 0377; \
693 (destination)[1] = ((character) >> 8) & 0377; \
694 (destination)[2] = (character) >> 16; \
695 (destination) += 3; \
696 } while (0)
697
698 /* Put into DESTINATION a character stored in three contiguous bytes
699 starting at SOURCE. */
700
701 #define EXTRACT_CHARACTER(destination, source) \
702 do { \
703 (destination) = ((source)[0] \
704 | ((source)[1] << 8) \
705 | ((source)[2] << 16)); \
706 } while (0)
707
708
709 /* Macros for charset. */
710
711 /* Size of bitmap of charset P in bytes. P is a start of charset,
712 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
713 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
714
715 /* Nonzero if charset P has range table. */
716 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
717
718 /* Return the address of range table of charset P. But not the start
719 of table itself, but the before where the number of ranges is
720 stored. `2 +' means to skip re_opcode_t and size of bitmap,
721 and the 2 bytes of flags at the start of the range table. */
722 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
723
724 /* Extract the bit flags that start a range table. */
725 #define CHARSET_RANGE_TABLE_BITS(p) \
726 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
727 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
728
729 /* Test if C is listed in the bitmap of charset P. */
730 #define CHARSET_LOOKUP_BITMAP(p, c) \
731 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
732 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
733
734 /* Return the address of end of RANGE_TABLE. COUNT is number of
735 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
736 is start of range and end of range. `* 3' is size of each start
737 and end. */
738 #define CHARSET_RANGE_TABLE_END(range_table, count) \
739 ((range_table) + (count) * 2 * 3)
740
741 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
742 COUNT is number of ranges in RANGE_TABLE. */
743 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
744 do \
745 { \
746 int range_start, range_end; \
747 unsigned char *p; \
748 unsigned char *range_table_end \
749 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
750 \
751 for (p = (range_table); p < range_table_end; p += 2 * 3) \
752 { \
753 EXTRACT_CHARACTER (range_start, p); \
754 EXTRACT_CHARACTER (range_end, p + 3); \
755 \
756 if (range_start <= (c) && (c) <= range_end) \
757 { \
758 (not) = !(not); \
759 break; \
760 } \
761 } \
762 } \
763 while (0)
764
765 /* Test if C is in range table of CHARSET. The flag NOT is negated if
766 C is listed in it. */
767 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
768 do \
769 { \
770 /* Number of ranges in range table. */ \
771 int count; \
772 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \
773 \
774 EXTRACT_NUMBER_AND_INCR (count, range_table); \
775 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
776 } \
777 while (0)
778 \f
779 /* If DEBUG is defined, Regex prints many voluminous messages about what
780 it is doing (if the variable `debug' is nonzero). If linked with the
781 main program in `iregex.c', you can enter patterns and strings
782 interactively. And if linked with the main program in `main.c' and
783 the other test files, you can run the already-written tests. */
784
785 #ifdef DEBUG
786
787 /* We use standard I/O for debugging. */
788 #include <stdio.h>
789
790 /* It is useful to test things that ``must'' be true when debugging. */
791 #include <assert.h>
792
793 static int debug = -100000;
794
795 #define DEBUG_STATEMENT(e) e
796 #define DEBUG_PRINT1(x) if (debug > 0) printf (x)
797 #define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
798 #define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
799 #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
800 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
801 if (debug > 0) print_partial_compiled_pattern (s, e)
802 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
803 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
804
805
806 /* Print the fastmap in human-readable form. */
807
808 void
809 print_fastmap (fastmap)
810 char *fastmap;
811 {
812 unsigned was_a_range = 0;
813 unsigned i = 0;
814
815 while (i < (1 << BYTEWIDTH))
816 {
817 if (fastmap[i++])
818 {
819 was_a_range = 0;
820 putchar (i - 1);
821 while (i < (1 << BYTEWIDTH) && fastmap[i])
822 {
823 was_a_range = 1;
824 i++;
825 }
826 if (was_a_range)
827 {
828 printf ("-");
829 putchar (i - 1);
830 }
831 }
832 }
833 putchar ('\n');
834 }
835
836
837 /* Print a compiled pattern string in human-readable form, starting at
838 the START pointer into it and ending just before the pointer END. */
839
840 void
841 print_partial_compiled_pattern (start, end)
842 unsigned char *start;
843 unsigned char *end;
844 {
845 int mcnt, mcnt2;
846 unsigned char *p = start;
847 unsigned char *pend = end;
848
849 if (start == NULL)
850 {
851 printf ("(null)\n");
852 return;
853 }
854
855 /* Loop over pattern commands. */
856 while (p < pend)
857 {
858 printf ("%d:\t", p - start);
859
860 switch ((re_opcode_t) *p++)
861 {
862 case no_op:
863 printf ("/no_op");
864 break;
865
866 case succeed:
867 printf ("/succeed");
868 break;
869
870 case exactn:
871 mcnt = *p++;
872 printf ("/exactn/%d", mcnt);
873 do
874 {
875 putchar ('/');
876 putchar (*p++);
877 }
878 while (--mcnt);
879 break;
880
881 case start_memory:
882 printf ("/start_memory/%d", *p++);
883 break;
884
885 case stop_memory:
886 printf ("/stop_memory/%d", *p++);
887 break;
888
889 case duplicate:
890 printf ("/duplicate/%d", *p++);
891 break;
892
893 case anychar:
894 printf ("/anychar");
895 break;
896
897 case charset:
898 case charset_not:
899 {
900 register int c, last = -100;
901 register int in_range = 0;
902 int length = CHARSET_BITMAP_SIZE (p - 1);
903 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
904
905 printf ("/charset [%s",
906 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
907
908 assert (p + *p < pend);
909
910 for (c = 0; c < 256; c++)
911 if (c / 8 < length
912 && (p[1 + (c/8)] & (1 << (c % 8))))
913 {
914 /* Are we starting a range? */
915 if (last + 1 == c && ! in_range)
916 {
917 putchar ('-');
918 in_range = 1;
919 }
920 /* Have we broken a range? */
921 else if (last + 1 != c && in_range)
922 {
923 putchar (last);
924 in_range = 0;
925 }
926
927 if (! in_range)
928 putchar (c);
929
930 last = c;
931 }
932
933 if (in_range)
934 putchar (last);
935
936 putchar (']');
937
938 p += 1 + length;
939
940 if (has_range_table)
941 {
942 int count;
943 printf ("has-range-table");
944
945 /* ??? Should print the range table; for now, just skip it. */
946 p += 2; /* skip range table bits */
947 EXTRACT_NUMBER_AND_INCR (count, p);
948 p = CHARSET_RANGE_TABLE_END (p, count);
949 }
950 }
951 break;
952
953 case begline:
954 printf ("/begline");
955 break;
956
957 case endline:
958 printf ("/endline");
959 break;
960
961 case on_failure_jump:
962 extract_number_and_incr (&mcnt, &p);
963 printf ("/on_failure_jump to %d", p + mcnt - start);
964 break;
965
966 case on_failure_keep_string_jump:
967 extract_number_and_incr (&mcnt, &p);
968 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
969 break;
970
971 case on_failure_jump_nastyloop:
972 extract_number_and_incr (&mcnt, &p);
973 printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
974 break;
975
976 case on_failure_jump_loop:
977 extract_number_and_incr (&mcnt, &p);
978 printf ("/on_failure_jump_loop to %d", p + mcnt - start);
979 break;
980
981 case on_failure_jump_smart:
982 extract_number_and_incr (&mcnt, &p);
983 printf ("/on_failure_jump_smart to %d", p + mcnt - start);
984 break;
985
986 case jump:
987 extract_number_and_incr (&mcnt, &p);
988 printf ("/jump to %d", p + mcnt - start);
989 break;
990
991 case succeed_n:
992 extract_number_and_incr (&mcnt, &p);
993 extract_number_and_incr (&mcnt2, &p);
994 printf ("/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
995 break;
996
997 case jump_n:
998 extract_number_and_incr (&mcnt, &p);
999 extract_number_and_incr (&mcnt2, &p);
1000 printf ("/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
1001 break;
1002
1003 case set_number_at:
1004 extract_number_and_incr (&mcnt, &p);
1005 extract_number_and_incr (&mcnt2, &p);
1006 printf ("/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
1007 break;
1008
1009 case wordbound:
1010 printf ("/wordbound");
1011 break;
1012
1013 case notwordbound:
1014 printf ("/notwordbound");
1015 break;
1016
1017 case wordbeg:
1018 printf ("/wordbeg");
1019 break;
1020
1021 case wordend:
1022 printf ("/wordend");
1023
1024 case syntaxspec:
1025 printf ("/syntaxspec");
1026 mcnt = *p++;
1027 printf ("/%d", mcnt);
1028 break;
1029
1030 case notsyntaxspec:
1031 printf ("/notsyntaxspec");
1032 mcnt = *p++;
1033 printf ("/%d", mcnt);
1034 break;
1035
1036 #ifdef emacs
1037 case before_dot:
1038 printf ("/before_dot");
1039 break;
1040
1041 case at_dot:
1042 printf ("/at_dot");
1043 break;
1044
1045 case after_dot:
1046 printf ("/after_dot");
1047 break;
1048
1049 case categoryspec:
1050 printf ("/categoryspec");
1051 mcnt = *p++;
1052 printf ("/%d", mcnt);
1053 break;
1054
1055 case notcategoryspec:
1056 printf ("/notcategoryspec");
1057 mcnt = *p++;
1058 printf ("/%d", mcnt);
1059 break;
1060 #endif /* emacs */
1061
1062 case begbuf:
1063 printf ("/begbuf");
1064 break;
1065
1066 case endbuf:
1067 printf ("/endbuf");
1068 break;
1069
1070 default:
1071 printf ("?%d", *(p-1));
1072 }
1073
1074 putchar ('\n');
1075 }
1076
1077 printf ("%d:\tend of pattern.\n", p - start);
1078 }
1079
1080
1081 void
1082 print_compiled_pattern (bufp)
1083 struct re_pattern_buffer *bufp;
1084 {
1085 unsigned char *buffer = bufp->buffer;
1086
1087 print_partial_compiled_pattern (buffer, buffer + bufp->used);
1088 printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, bufp->allocated);
1089
1090 if (bufp->fastmap_accurate && bufp->fastmap)
1091 {
1092 printf ("fastmap: ");
1093 print_fastmap (bufp->fastmap);
1094 }
1095
1096 printf ("re_nsub: %d\t", bufp->re_nsub);
1097 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1098 printf ("can_be_null: %d\t", bufp->can_be_null);
1099 printf ("newline_anchor: %d\n", bufp->newline_anchor);
1100 printf ("no_sub: %d\t", bufp->no_sub);
1101 printf ("not_bol: %d\t", bufp->not_bol);
1102 printf ("not_eol: %d\t", bufp->not_eol);
1103 printf ("syntax: %d\n", bufp->syntax);
1104 fflush (stdout);
1105 /* Perhaps we should print the translate table? */
1106 }
1107
1108
1109 void
1110 print_double_string (where, string1, size1, string2, size2)
1111 re_char *where;
1112 re_char *string1;
1113 re_char *string2;
1114 int size1;
1115 int size2;
1116 {
1117 unsigned this_char;
1118
1119 if (where == NULL)
1120 printf ("(null)");
1121 else
1122 {
1123 if (FIRST_STRING_P (where))
1124 {
1125 for (this_char = where - string1; this_char < size1; this_char++)
1126 putchar (string1[this_char]);
1127
1128 where = string2;
1129 }
1130
1131 for (this_char = where - string2; this_char < size2; this_char++)
1132 putchar (string2[this_char]);
1133 }
1134 }
1135
1136 #else /* not DEBUG */
1137
1138 #undef assert
1139 #define assert(e)
1140
1141 #define DEBUG_STATEMENT(e)
1142 #define DEBUG_PRINT1(x)
1143 #define DEBUG_PRINT2(x1, x2)
1144 #define DEBUG_PRINT3(x1, x2, x3)
1145 #define DEBUG_PRINT4(x1, x2, x3, x4)
1146 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1147 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1148
1149 #endif /* not DEBUG */
1150 \f
1151 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1152 also be assigned to arbitrarily: each pattern buffer stores its own
1153 syntax, so it can be changed between regex compilations. */
1154 /* This has no initializer because initialized variables in Emacs
1155 become read-only after dumping. */
1156 reg_syntax_t re_syntax_options;
1157
1158
1159 /* Specify the precise syntax of regexps for compilation. This provides
1160 for compatibility for various utilities which historically have
1161 different, incompatible syntaxes.
1162
1163 The argument SYNTAX is a bit mask comprised of the various bits
1164 defined in regex.h. We return the old syntax. */
1165
1166 reg_syntax_t
1167 re_set_syntax (syntax)
1168 reg_syntax_t syntax;
1169 {
1170 reg_syntax_t ret = re_syntax_options;
1171
1172 re_syntax_options = syntax;
1173 return ret;
1174 }
1175 \f
1176 /* This table gives an error message for each of the error codes listed
1177 in regex.h. Obviously the order here has to be same as there.
1178 POSIX doesn't require that we do anything for REG_NOERROR,
1179 but why not be nice? */
1180
1181 static const char *re_error_msgid[] =
1182 {
1183 gettext_noop ("Success"), /* REG_NOERROR */
1184 gettext_noop ("No match"), /* REG_NOMATCH */
1185 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1186 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1187 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1188 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1189 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1190 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1191 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1192 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1193 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1194 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1195 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1196 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1197 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1198 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1199 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1200 };
1201 \f
1202 /* Avoiding alloca during matching, to placate r_alloc. */
1203
1204 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1205 searching and matching functions should not call alloca. On some
1206 systems, alloca is implemented in terms of malloc, and if we're
1207 using the relocating allocator routines, then malloc could cause a
1208 relocation, which might (if the strings being searched are in the
1209 ralloc heap) shift the data out from underneath the regexp
1210 routines.
1211
1212 Here's another reason to avoid allocation: Emacs
1213 processes input from X in a signal handler; processing X input may
1214 call malloc; if input arrives while a matching routine is calling
1215 malloc, then we're scrod. But Emacs can't just block input while
1216 calling matching routines; then we don't notice interrupts when
1217 they come in. So, Emacs blocks input around all regexp calls
1218 except the matching calls, which it leaves unprotected, in the
1219 faith that they will not malloc. */
1220
1221 /* Normally, this is fine. */
1222 #define MATCH_MAY_ALLOCATE
1223
1224 /* When using GNU C, we are not REALLY using the C alloca, no matter
1225 what config.h may say. So don't take precautions for it. */
1226 #ifdef __GNUC__
1227 #undef C_ALLOCA
1228 #endif
1229
1230 /* The match routines may not allocate if (1) they would do it with malloc
1231 and (2) it's not safe for them to use malloc.
1232 Note that if REL_ALLOC is defined, matching would not use malloc for the
1233 failure stack, but we would still use it for the register vectors;
1234 so REL_ALLOC should not affect this. */
1235 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1236 #undef MATCH_MAY_ALLOCATE
1237 #endif
1238
1239 \f
1240 /* Failure stack declarations and macros; both re_compile_fastmap and
1241 re_match_2 use a failure stack. These have to be macros because of
1242 REGEX_ALLOCATE_STACK. */
1243
1244
1245 /* Approximate number of failure points for which to initially allocate space
1246 when matching. If this number is exceeded, we allocate more
1247 space, so it is not a hard limit. */
1248 #ifndef INIT_FAILURE_ALLOC
1249 #define INIT_FAILURE_ALLOC 20
1250 #endif
1251
1252 /* Roughly the maximum number of failure points on the stack. Would be
1253 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1254 This is a variable only so users of regex can assign to it; we never
1255 change it ourselves. */
1256 #if defined (MATCH_MAY_ALLOCATE)
1257 /* Note that 4400 is enough to cause a crash on Alpha OSF/1,
1258 whose default stack limit is 2mb. In order for a larger
1259 value to work reliably, you have to try to make it accord
1260 with the process stack limit. */
1261 int re_max_failures = 40000;
1262 #else
1263 int re_max_failures = 4000;
1264 #endif
1265
1266 union fail_stack_elt
1267 {
1268 const unsigned char *pointer;
1269 unsigned int integer;
1270 };
1271
1272 typedef union fail_stack_elt fail_stack_elt_t;
1273
1274 typedef struct
1275 {
1276 fail_stack_elt_t *stack;
1277 unsigned size;
1278 unsigned avail; /* Offset of next open position. */
1279 unsigned frame; /* Offset of the cur constructed frame. */
1280 } fail_stack_type;
1281
1282 #define PATTERN_STACK_EMPTY() (fail_stack.avail == 0)
1283 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
1284 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1285
1286
1287 /* Define macros to initialize and free the failure stack.
1288 Do `return -2' if the alloc fails. */
1289
1290 #ifdef MATCH_MAY_ALLOCATE
1291 #define INIT_FAIL_STACK() \
1292 do { \
1293 fail_stack.stack = (fail_stack_elt_t *) \
1294 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1295 * sizeof (fail_stack_elt_t)); \
1296 \
1297 if (fail_stack.stack == NULL) \
1298 return -2; \
1299 \
1300 fail_stack.size = INIT_FAILURE_ALLOC; \
1301 fail_stack.avail = 0; \
1302 fail_stack.frame = 0; \
1303 } while (0)
1304
1305 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1306 #else
1307 #define INIT_FAIL_STACK() \
1308 do { \
1309 fail_stack.avail = 0; \
1310 fail_stack.frame = 0; \
1311 } while (0)
1312
1313 #define RESET_FAIL_STACK() ((void)0)
1314 #endif
1315
1316
1317 /* Double the size of FAIL_STACK, up to a limit
1318 which allows approximately `re_max_failures' items.
1319
1320 Return 1 if succeeds, and 0 if either ran out of memory
1321 allocating space for it or it was already too large.
1322
1323 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1324
1325 /* Factor to increase the failure stack size by
1326 when we increase it.
1327 This used to be 2, but 2 was too wasteful
1328 because the old discarded stacks added up to as much space
1329 were as ultimate, maximum-size stack. */
1330 #define FAIL_STACK_GROWTH_FACTOR 4
1331
1332 #define GROW_FAIL_STACK(fail_stack) \
1333 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1334 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1335 ? 0 \
1336 : ((fail_stack).stack \
1337 = (fail_stack_elt_t *) \
1338 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1339 (fail_stack).size * sizeof (fail_stack_elt_t), \
1340 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1341 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1342 * FAIL_STACK_GROWTH_FACTOR))), \
1343 \
1344 (fail_stack).stack == NULL \
1345 ? 0 \
1346 : ((fail_stack).size \
1347 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1348 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1349 * FAIL_STACK_GROWTH_FACTOR)) \
1350 / sizeof (fail_stack_elt_t)), \
1351 1)))
1352
1353
1354 /* Push pointer POINTER on FAIL_STACK.
1355 Return 1 if was able to do so and 0 if ran out of memory allocating
1356 space to do so. */
1357 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1358 ((FAIL_STACK_FULL () \
1359 && !GROW_FAIL_STACK (FAIL_STACK)) \
1360 ? 0 \
1361 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1362 1))
1363 #define POP_PATTERN_OP() POP_FAILURE_POINTER ()
1364
1365 /* Push a pointer value onto the failure stack.
1366 Assumes the variable `fail_stack'. Probably should only
1367 be called from within `PUSH_FAILURE_POINT'. */
1368 #define PUSH_FAILURE_POINTER(item) \
1369 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)
1370
1371 /* This pushes an integer-valued item onto the failure stack.
1372 Assumes the variable `fail_stack'. Probably should only
1373 be called from within `PUSH_FAILURE_POINT'. */
1374 #define PUSH_FAILURE_INT(item) \
1375 fail_stack.stack[fail_stack.avail++].integer = (item)
1376
1377 /* Push a fail_stack_elt_t value onto the failure stack.
1378 Assumes the variable `fail_stack'. Probably should only
1379 be called from within `PUSH_FAILURE_POINT'. */
1380 #define PUSH_FAILURE_ELT(item) \
1381 fail_stack.stack[fail_stack.avail++] = (item)
1382
1383 /* These three POP... operations complement the three PUSH... operations.
1384 All assume that `fail_stack' is nonempty. */
1385 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1386 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1387 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1388
1389 /* Individual items aside from the registers. */
1390 #define NUM_NONREG_ITEMS 3
1391
1392 /* Used to examine the stack (to detect infinite loops). */
1393 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
1394 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
1395 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1396 #define TOP_FAILURE_HANDLE() fail_stack.frame
1397
1398
1399 #define ENSURE_FAIL_STACK(space) \
1400 while (REMAINING_AVAIL_SLOTS <= space) { \
1401 if (!GROW_FAIL_STACK (fail_stack)) \
1402 return -2; \
1403 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1404 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1405 }
1406
1407 /* Push register NUM onto the stack. */
1408 #define PUSH_FAILURE_REG(num) \
1409 do { \
1410 char *destination; \
1411 ENSURE_FAIL_STACK(3); \
1412 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1413 num, regstart[num], regend[num]); \
1414 PUSH_FAILURE_POINTER (regstart[num]); \
1415 PUSH_FAILURE_POINTER (regend[num]); \
1416 PUSH_FAILURE_INT (num); \
1417 } while (0)
1418
1419 /* Pop a saved register off the stack. */
1420 #define POP_FAILURE_REG() \
1421 do { \
1422 int reg = POP_FAILURE_INT (); \
1423 regend[reg] = POP_FAILURE_POINTER (); \
1424 regstart[reg] = POP_FAILURE_POINTER (); \
1425 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1426 reg, regstart[reg], regend[reg]); \
1427 } while (0)
1428
1429 /* Check that we are not stuck in an infinite loop. */
1430 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1431 do { \
1432 int failure = TOP_FAILURE_HANDLE(); \
1433 /* Check for infinite matching loops */ \
1434 while (failure > 0 && \
1435 (FAILURE_STR (failure) == string_place \
1436 || FAILURE_STR (failure) == NULL)) \
1437 { \
1438 assert (FAILURE_PAT (failure) >= bufp->buffer \
1439 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1440 if (FAILURE_PAT (failure) == pat_cur) \
1441 goto fail; \
1442 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1443 failure = NEXT_FAILURE_HANDLE(failure); \
1444 } \
1445 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1446 } while (0)
1447
1448 /* Push the information about the state we will need
1449 if we ever fail back to it.
1450
1451 Requires variables fail_stack, regstart, regend and
1452 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1453 declared.
1454
1455 Does `return FAILURE_CODE' if runs out of memory. */
1456
1457 #define PUSH_FAILURE_POINT(pattern, string_place) \
1458 do { \
1459 char *destination; \
1460 /* Must be int, so when we don't save any registers, the arithmetic \
1461 of 0 + -1 isn't done as unsigned. */ \
1462 \
1463 DEBUG_STATEMENT (failure_id++); \
1464 DEBUG_STATEMENT (nfailure_points_pushed++); \
1465 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1466 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1467 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1468 \
1469 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1470 \
1471 DEBUG_PRINT1 ("\n"); \
1472 \
1473 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1474 PUSH_FAILURE_INT (fail_stack.frame); \
1475 \
1476 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1477 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1478 DEBUG_PRINT1 ("'\n"); \
1479 PUSH_FAILURE_POINTER (string_place); \
1480 \
1481 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1482 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1483 PUSH_FAILURE_POINTER (pattern); \
1484 \
1485 /* Close the frame by moving the frame pointer past it. */ \
1486 fail_stack.frame = fail_stack.avail; \
1487 } while (0)
1488
1489 /* Estimate the size of data pushed by a typical failure stack entry.
1490 An estimate is all we need, because all we use this for
1491 is to choose a limit for how big to make the failure stack. */
1492
1493 #define TYPICAL_FAILURE_SIZE 20
1494
1495 /* How many items can still be added to the stack without overflowing it. */
1496 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1497
1498
1499 /* Pops what PUSH_FAIL_STACK pushes.
1500
1501 We restore into the parameters, all of which should be lvalues:
1502 STR -- the saved data position.
1503 PAT -- the saved pattern position.
1504 REGSTART, REGEND -- arrays of string positions.
1505
1506 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1507 `pend', `string1', `size1', `string2', and `size2'. */
1508
1509 #define POP_FAILURE_POINT(str, pat) \
1510 do { \
1511 assert (!FAIL_STACK_EMPTY ()); \
1512 \
1513 /* Remove failure points and point to how many regs pushed. */ \
1514 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1515 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1516 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1517 \
1518 /* Pop the saved registers. */ \
1519 while (fail_stack.frame < fail_stack.avail) \
1520 POP_FAILURE_REG (); \
1521 \
1522 pat = (unsigned char *) POP_FAILURE_POINTER (); \
1523 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1524 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1525 \
1526 /* If the saved string location is NULL, it came from an \
1527 on_failure_keep_string_jump opcode, and we want to throw away the \
1528 saved NULL, thus retaining our current position in the string. */ \
1529 str = (re_char *) POP_FAILURE_POINTER (); \
1530 DEBUG_PRINT2 (" Popping string %p: `", str); \
1531 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1532 DEBUG_PRINT1 ("'\n"); \
1533 \
1534 fail_stack.frame = POP_FAILURE_INT (); \
1535 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
1536 \
1537 assert (fail_stack.avail >= 0); \
1538 assert (fail_stack.frame <= fail_stack.avail); \
1539 \
1540 DEBUG_STATEMENT (nfailure_points_popped++); \
1541 } while (0) /* POP_FAILURE_POINT */
1542
1543
1544 \f
1545 /* Registers are set to a sentinel when they haven't yet matched. */
1546 #define REG_UNSET_VALUE NULL
1547 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1548 \f
1549 /* Subroutine declarations and macros for regex_compile. */
1550
1551 static void store_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc, int arg));
1552 static void store_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1553 int arg1, int arg2));
1554 static void insert_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1555 int arg, unsigned char *end));
1556 static void insert_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc,
1557 int arg1, int arg2, unsigned char *end));
1558 static boolean at_begline_loc_p _RE_ARGS((const unsigned char *pattern,
1559 const unsigned char *p,
1560 reg_syntax_t syntax));
1561 static boolean at_endline_loc_p _RE_ARGS((const unsigned char *p,
1562 const unsigned char *pend,
1563 reg_syntax_t syntax));
1564 static unsigned char *skip_one_char _RE_ARGS((unsigned char *p));
1565 static int analyse_first _RE_ARGS((unsigned char *p, unsigned char *pend,
1566 char *fastmap, const int multibyte));
1567
1568 /* Fetch the next character in the uncompiled pattern---translating it
1569 if necessary. Also cast from a signed character in the constant
1570 string passed to us by the user to an unsigned char that we can use
1571 as an array index (in, e.g., `translate'). */
1572 #define PATFETCH(c) \
1573 do { \
1574 PATFETCH_RAW (c); \
1575 c = TRANSLATE (c); \
1576 } while (0)
1577
1578 /* Fetch the next character in the uncompiled pattern, with no
1579 translation. */
1580 #define PATFETCH_RAW(c) \
1581 do { \
1582 int len; \
1583 if (p == pend) return REG_EEND; \
1584 c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \
1585 p += len; \
1586 } while (0)
1587
1588
1589 /* If `translate' is non-null, return translate[D], else just D. We
1590 cast the subscript to translate because some data is declared as
1591 `char *', to avoid warnings when a string constant is passed. But
1592 when we use a character as a subscript we must make it unsigned. */
1593 #ifndef TRANSLATE
1594 #define TRANSLATE(d) \
1595 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1596 #endif
1597
1598
1599 /* Macros for outputting the compiled pattern into `buffer'. */
1600
1601 /* If the buffer isn't allocated when it comes in, use this. */
1602 #define INIT_BUF_SIZE 32
1603
1604 /* Make sure we have at least N more bytes of space in buffer. */
1605 #define GET_BUFFER_SPACE(n) \
1606 while (b - bufp->buffer + (n) > bufp->allocated) \
1607 EXTEND_BUFFER ()
1608
1609 /* Make sure we have one more byte of buffer space and then add C to it. */
1610 #define BUF_PUSH(c) \
1611 do { \
1612 GET_BUFFER_SPACE (1); \
1613 *b++ = (unsigned char) (c); \
1614 } while (0)
1615
1616
1617 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1618 #define BUF_PUSH_2(c1, c2) \
1619 do { \
1620 GET_BUFFER_SPACE (2); \
1621 *b++ = (unsigned char) (c1); \
1622 *b++ = (unsigned char) (c2); \
1623 } while (0)
1624
1625
1626 /* As with BUF_PUSH_2, except for three bytes. */
1627 #define BUF_PUSH_3(c1, c2, c3) \
1628 do { \
1629 GET_BUFFER_SPACE (3); \
1630 *b++ = (unsigned char) (c1); \
1631 *b++ = (unsigned char) (c2); \
1632 *b++ = (unsigned char) (c3); \
1633 } while (0)
1634
1635
1636 /* Store a jump with opcode OP at LOC to location TO. We store a
1637 relative address offset by the three bytes the jump itself occupies. */
1638 #define STORE_JUMP(op, loc, to) \
1639 store_op1 (op, loc, (to) - (loc) - 3)
1640
1641 /* Likewise, for a two-argument jump. */
1642 #define STORE_JUMP2(op, loc, to, arg) \
1643 store_op2 (op, loc, (to) - (loc) - 3, arg)
1644
1645 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1646 #define INSERT_JUMP(op, loc, to) \
1647 insert_op1 (op, loc, (to) - (loc) - 3, b)
1648
1649 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1650 #define INSERT_JUMP2(op, loc, to, arg) \
1651 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1652
1653
1654 /* This is not an arbitrary limit: the arguments which represent offsets
1655 into the pattern are two bytes long. So if 2^16 bytes turns out to
1656 be too small, many things would have to change. */
1657 #define MAX_BUF_SIZE (1L << 16)
1658
1659
1660 /* Extend the buffer by twice its current size via realloc and
1661 reset the pointers that pointed into the old block to point to the
1662 correct places in the new one. If extending the buffer results in it
1663 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1664 #define EXTEND_BUFFER() \
1665 do { \
1666 unsigned char *old_buffer = bufp->buffer; \
1667 if (bufp->allocated == MAX_BUF_SIZE) \
1668 return REG_ESIZE; \
1669 bufp->allocated <<= 1; \
1670 if (bufp->allocated > MAX_BUF_SIZE) \
1671 bufp->allocated = MAX_BUF_SIZE; \
1672 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
1673 if (bufp->buffer == NULL) \
1674 return REG_ESPACE; \
1675 /* If the buffer moved, move all the pointers into it. */ \
1676 if (old_buffer != bufp->buffer) \
1677 { \
1678 b = (b - old_buffer) + bufp->buffer; \
1679 begalt = (begalt - old_buffer) + bufp->buffer; \
1680 if (fixup_alt_jump) \
1681 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
1682 if (laststart) \
1683 laststart = (laststart - old_buffer) + bufp->buffer; \
1684 if (pending_exact) \
1685 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
1686 } \
1687 } while (0)
1688
1689
1690 /* Since we have one byte reserved for the register number argument to
1691 {start,stop}_memory, the maximum number of groups we can report
1692 things about is what fits in that byte. */
1693 #define MAX_REGNUM 255
1694
1695 /* But patterns can have more than `MAX_REGNUM' registers. We just
1696 ignore the excess. */
1697 typedef unsigned regnum_t;
1698
1699
1700 /* Macros for the compile stack. */
1701
1702 /* Since offsets can go either forwards or backwards, this type needs to
1703 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1704 typedef int pattern_offset_t;
1705
1706 typedef struct
1707 {
1708 pattern_offset_t begalt_offset;
1709 pattern_offset_t fixup_alt_jump;
1710 pattern_offset_t laststart_offset;
1711 regnum_t regnum;
1712 } compile_stack_elt_t;
1713
1714
1715 typedef struct
1716 {
1717 compile_stack_elt_t *stack;
1718 unsigned size;
1719 unsigned avail; /* Offset of next open position. */
1720 } compile_stack_type;
1721
1722
1723 #define INIT_COMPILE_STACK_SIZE 32
1724
1725 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1726 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1727
1728 /* The next available element. */
1729 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1730
1731
1732 /* Structure to manage work area for range table. */
1733 struct range_table_work_area
1734 {
1735 int *table; /* actual work area. */
1736 int allocated; /* allocated size for work area in bytes. */
1737 int used; /* actually used size in words. */
1738 int bits; /* flag to record character classes */
1739 };
1740
1741 /* Make sure that WORK_AREA can hold more N multibyte characters. */
1742 #define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n) \
1743 do { \
1744 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1745 { \
1746 (work_area).allocated += 16 * sizeof (int); \
1747 if ((work_area).table) \
1748 (work_area).table \
1749 = (int *) realloc ((work_area).table, (work_area).allocated); \
1750 else \
1751 (work_area).table \
1752 = (int *) malloc ((work_area).allocated); \
1753 if ((work_area).table == 0) \
1754 FREE_STACK_RETURN (REG_ESPACE); \
1755 } \
1756 } while (0)
1757
1758 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1759 (work_area).bits |= (bit)
1760
1761 /* These bits represent the various character classes such as [:alnum:]
1762 in a charset's range table. */
1763 #define BIT_ALNUM 0x1
1764 #define BIT_ALPHA 0x2
1765 #define BIT_WORD 0x4
1766 #define BIT_ASCII 0x8
1767 #define BIT_NONASCII 0x10
1768 #define BIT_GRAPH 0x20
1769 #define BIT_LOWER 0x40
1770 #define BIT_PRINT 0x80
1771 #define BIT_PUNCT 0x100
1772 #define BIT_SPACE 0x200
1773 #define BIT_UPPER 0x400
1774 #define BIT_UNIBYTE 0x800
1775 #define BIT_MULTIBYTE 0x1000
1776
1777 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1778 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1779 do { \
1780 EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2); \
1781 (work_area).table[(work_area).used++] = (range_start); \
1782 (work_area).table[(work_area).used++] = (range_end); \
1783 } while (0)
1784
1785 /* Free allocated memory for WORK_AREA. */
1786 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1787 do { \
1788 if ((work_area).table) \
1789 free ((work_area).table); \
1790 } while (0)
1791
1792 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1793 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1794 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1795 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1796
1797
1798 /* Set the bit for character C in a list. */
1799 #define SET_LIST_BIT(c) \
1800 (b[((unsigned char) (c)) / BYTEWIDTH] \
1801 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1802
1803
1804 /* Get the next unsigned number in the uncompiled pattern. */
1805 #define GET_UNSIGNED_NUMBER(num) \
1806 do { if (p != pend) \
1807 { \
1808 PATFETCH (c); \
1809 while (ISDIGIT (c)) \
1810 { \
1811 if (num < 0) \
1812 num = 0; \
1813 num = num * 10 + c - '0'; \
1814 if (p == pend) \
1815 break; \
1816 PATFETCH (c); \
1817 } \
1818 } \
1819 } while (0)
1820
1821 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1822
1823 #define IS_CHAR_CLASS(string) \
1824 (STREQ (string, "alpha") || STREQ (string, "upper") \
1825 || STREQ (string, "lower") || STREQ (string, "digit") \
1826 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1827 || STREQ (string, "space") || STREQ (string, "print") \
1828 || STREQ (string, "punct") || STREQ (string, "graph") \
1829 || STREQ (string, "cntrl") || STREQ (string, "blank") \
1830 || STREQ (string, "word") \
1831 || STREQ (string, "ascii") || STREQ (string, "nonascii") \
1832 || STREQ (string, "unibyte") || STREQ (string, "multibyte"))
1833
1834 /* QUIT is only used on NTemacs. */
1835 #if !defined (WINDOWSNT) || !defined (emacs)
1836 #undef QUIT
1837 #define QUIT
1838 #endif
1839 \f
1840 #ifndef MATCH_MAY_ALLOCATE
1841
1842 /* If we cannot allocate large objects within re_match_2_internal,
1843 we make the fail stack and register vectors global.
1844 The fail stack, we grow to the maximum size when a regexp
1845 is compiled.
1846 The register vectors, we adjust in size each time we
1847 compile a regexp, according to the number of registers it needs. */
1848
1849 static fail_stack_type fail_stack;
1850
1851 /* Size with which the following vectors are currently allocated.
1852 That is so we can make them bigger as needed,
1853 but never make them smaller. */
1854 static int regs_allocated_size;
1855
1856 static re_char ** regstart, ** regend;
1857 static re_char **best_regstart, **best_regend;
1858
1859 /* Make the register vectors big enough for NUM_REGS registers,
1860 but don't make them smaller. */
1861
1862 static
1863 regex_grow_registers (num_regs)
1864 int num_regs;
1865 {
1866 if (num_regs > regs_allocated_size)
1867 {
1868 RETALLOC_IF (regstart, num_regs, re_char *);
1869 RETALLOC_IF (regend, num_regs, re_char *);
1870 RETALLOC_IF (best_regstart, num_regs, re_char *);
1871 RETALLOC_IF (best_regend, num_regs, re_char *);
1872
1873 regs_allocated_size = num_regs;
1874 }
1875 }
1876
1877 #endif /* not MATCH_MAY_ALLOCATE */
1878 \f
1879 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
1880 compile_stack,
1881 regnum_t regnum));
1882
1883 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1884 Returns one of error codes defined in `regex.h', or zero for success.
1885
1886 Assumes the `allocated' (and perhaps `buffer') and `translate'
1887 fields are set in BUFP on entry.
1888
1889 If it succeeds, results are put in BUFP (if it returns an error, the
1890 contents of BUFP are undefined):
1891 `buffer' is the compiled pattern;
1892 `syntax' is set to SYNTAX;
1893 `used' is set to the length of the compiled pattern;
1894 `fastmap_accurate' is zero;
1895 `re_nsub' is the number of subexpressions in PATTERN;
1896 `not_bol' and `not_eol' are zero;
1897
1898 The `fastmap' and `newline_anchor' fields are neither
1899 examined nor set. */
1900
1901 /* Insert the `jump' from the end of last alternative to "here".
1902 The space for the jump has already been allocated. */
1903 #define FIXUP_ALT_JUMP() \
1904 do { \
1905 if (fixup_alt_jump) \
1906 STORE_JUMP (jump, fixup_alt_jump, b); \
1907 } while (0)
1908
1909
1910 /* Return, freeing storage we allocated. */
1911 #define FREE_STACK_RETURN(value) \
1912 do { \
1913 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
1914 free (compile_stack.stack); \
1915 return value; \
1916 } while (0)
1917
1918 static reg_errcode_t
1919 regex_compile (pattern, size, syntax, bufp)
1920 re_char *pattern;
1921 int size;
1922 reg_syntax_t syntax;
1923 struct re_pattern_buffer *bufp;
1924 {
1925 /* We fetch characters from PATTERN here. Even though PATTERN is
1926 `char *' (i.e., signed), we declare these variables as unsigned, so
1927 they can be reliably used as array indices. */
1928 register unsigned int c, c1;
1929
1930 /* A random temporary spot in PATTERN. */
1931 re_char *p1;
1932
1933 /* Points to the end of the buffer, where we should append. */
1934 register unsigned char *b;
1935
1936 /* Keeps track of unclosed groups. */
1937 compile_stack_type compile_stack;
1938
1939 /* Points to the current (ending) position in the pattern. */
1940 #ifdef AIX
1941 /* `const' makes AIX compiler fail. */
1942 unsigned char *p = pattern;
1943 #else
1944 re_char *p = pattern;
1945 #endif
1946 re_char *pend = pattern + size;
1947
1948 /* How to translate the characters in the pattern. */
1949 RE_TRANSLATE_TYPE translate = bufp->translate;
1950
1951 /* Address of the count-byte of the most recently inserted `exactn'
1952 command. This makes it possible to tell if a new exact-match
1953 character can be added to that command or if the character requires
1954 a new `exactn' command. */
1955 unsigned char *pending_exact = 0;
1956
1957 /* Address of start of the most recently finished expression.
1958 This tells, e.g., postfix * where to find the start of its
1959 operand. Reset at the beginning of groups and alternatives. */
1960 unsigned char *laststart = 0;
1961
1962 /* Address of beginning of regexp, or inside of last group. */
1963 unsigned char *begalt;
1964
1965 /* Place in the uncompiled pattern (i.e., the {) to
1966 which to go back if the interval is invalid. */
1967 re_char *beg_interval;
1968
1969 /* Address of the place where a forward jump should go to the end of
1970 the containing expression. Each alternative of an `or' -- except the
1971 last -- ends with a forward jump of this sort. */
1972 unsigned char *fixup_alt_jump = 0;
1973
1974 /* Counts open-groups as they are encountered. Remembered for the
1975 matching close-group on the compile stack, so the same register
1976 number is put in the stop_memory as the start_memory. */
1977 regnum_t regnum = 0;
1978
1979 /* Work area for range table of charset. */
1980 struct range_table_work_area range_table_work;
1981
1982 /* If the object matched can contain multibyte characters. */
1983 const boolean multibyte = RE_MULTIBYTE_P (bufp);
1984
1985 #ifdef DEBUG
1986 debug++;
1987 DEBUG_PRINT1 ("\nCompiling pattern: ");
1988 if (debug > 0)
1989 {
1990 unsigned debug_count;
1991
1992 for (debug_count = 0; debug_count < size; debug_count++)
1993 putchar (pattern[debug_count]);
1994 putchar ('\n');
1995 }
1996 #endif /* DEBUG */
1997
1998 /* Initialize the compile stack. */
1999 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2000 if (compile_stack.stack == NULL)
2001 return REG_ESPACE;
2002
2003 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2004 compile_stack.avail = 0;
2005
2006 range_table_work.table = 0;
2007 range_table_work.allocated = 0;
2008
2009 /* Initialize the pattern buffer. */
2010 bufp->syntax = syntax;
2011 bufp->fastmap_accurate = 0;
2012 bufp->not_bol = bufp->not_eol = 0;
2013
2014 /* Set `used' to zero, so that if we return an error, the pattern
2015 printer (for debugging) will think there's no pattern. We reset it
2016 at the end. */
2017 bufp->used = 0;
2018
2019 /* Always count groups, whether or not bufp->no_sub is set. */
2020 bufp->re_nsub = 0;
2021
2022 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2023 /* Initialize the syntax table. */
2024 init_syntax_once ();
2025 #endif
2026
2027 if (bufp->allocated == 0)
2028 {
2029 if (bufp->buffer)
2030 { /* If zero allocated, but buffer is non-null, try to realloc
2031 enough space. This loses if buffer's address is bogus, but
2032 that is the user's responsibility. */
2033 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2034 }
2035 else
2036 { /* Caller did not allocate a buffer. Do it for them. */
2037 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2038 }
2039 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2040
2041 bufp->allocated = INIT_BUF_SIZE;
2042 }
2043
2044 begalt = b = bufp->buffer;
2045
2046 /* Loop through the uncompiled pattern until we're at the end. */
2047 while (p != pend)
2048 {
2049 PATFETCH (c);
2050
2051 switch (c)
2052 {
2053 case '^':
2054 {
2055 if ( /* If at start of pattern, it's an operator. */
2056 p == pattern + 1
2057 /* If context independent, it's an operator. */
2058 || syntax & RE_CONTEXT_INDEP_ANCHORS
2059 /* Otherwise, depends on what's come before. */
2060 || at_begline_loc_p (pattern, p, syntax))
2061 BUF_PUSH (begline);
2062 else
2063 goto normal_char;
2064 }
2065 break;
2066
2067
2068 case '$':
2069 {
2070 if ( /* If at end of pattern, it's an operator. */
2071 p == pend
2072 /* If context independent, it's an operator. */
2073 || syntax & RE_CONTEXT_INDEP_ANCHORS
2074 /* Otherwise, depends on what's next. */
2075 || at_endline_loc_p (p, pend, syntax))
2076 BUF_PUSH (endline);
2077 else
2078 goto normal_char;
2079 }
2080 break;
2081
2082
2083 case '+':
2084 case '?':
2085 if ((syntax & RE_BK_PLUS_QM)
2086 || (syntax & RE_LIMITED_OPS))
2087 goto normal_char;
2088 handle_plus:
2089 case '*':
2090 /* If there is no previous pattern... */
2091 if (!laststart)
2092 {
2093 if (syntax & RE_CONTEXT_INVALID_OPS)
2094 FREE_STACK_RETURN (REG_BADRPT);
2095 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2096 goto normal_char;
2097 }
2098
2099 {
2100 /* 1 means zero (many) matches is allowed. */
2101 boolean zero_times_ok = 0, many_times_ok = 0;
2102 boolean greedy = 1;
2103
2104 /* If there is a sequence of repetition chars, collapse it
2105 down to just one (the right one). We can't combine
2106 interval operators with these because of, e.g., `a{2}*',
2107 which should only match an even number of `a's. */
2108
2109 for (;;)
2110 {
2111 if (!(syntax & RE_ALL_GREEDY)
2112 && c == '?' && (zero_times_ok || many_times_ok))
2113 greedy = 0;
2114 else
2115 {
2116 zero_times_ok |= c != '+';
2117 many_times_ok |= c != '?';
2118 }
2119
2120 if (p == pend)
2121 break;
2122 else if (*p == '*'
2123 || (!(syntax & RE_BK_PLUS_QM)
2124 && (*p == '+' || *p == '?')))
2125 ;
2126 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
2127 {
2128 if (p+1 == pend)
2129 FREE_STACK_RETURN (REG_EESCAPE);
2130 if (p[1] == '+' || p[1] == '?')
2131 PATFETCH (c); /* Gobble up the backslash. */
2132 else
2133 break;
2134 }
2135 else
2136 break;
2137 /* If we get here, we found another repeat character. */
2138 PATFETCH (c);
2139 }
2140
2141 /* Star, etc. applied to an empty pattern is equivalent
2142 to an empty pattern. */
2143 if (!laststart || laststart == b)
2144 break;
2145
2146 /* Now we know whether or not zero matches is allowed
2147 and also whether or not two or more matches is allowed. */
2148 if (greedy)
2149 {
2150 if (many_times_ok)
2151 {
2152 boolean simple = skip_one_char (laststart) == b;
2153 unsigned int startoffset = 0;
2154 re_opcode_t ofj =
2155 (simple || !analyse_first (laststart, b, NULL, 0)) ?
2156 on_failure_jump : on_failure_jump_loop;
2157 assert (skip_one_char (laststart) <= b);
2158
2159 if (!zero_times_ok && simple)
2160 { /* Since simple * loops can be made faster by using
2161 on_failure_keep_string_jump, we turn simple P+
2162 into PP* if P is simple. */
2163 unsigned char *p1, *p2;
2164 startoffset = b - laststart;
2165 GET_BUFFER_SPACE (startoffset);
2166 p1 = b; p2 = laststart;
2167 while (p2 < p1)
2168 *b++ = *p2++;
2169 zero_times_ok = 1;
2170 }
2171
2172 GET_BUFFER_SPACE (6);
2173 if (!zero_times_ok)
2174 /* A + loop. */
2175 STORE_JUMP (ofj, b, b + 6);
2176 else
2177 /* Simple * loops can use on_failure_keep_string_jump
2178 depending on what follows. But since we don't know
2179 that yet, we leave the decision up to
2180 on_failure_jump_smart. */
2181 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2182 laststart + startoffset, b + 6);
2183 b += 3;
2184 STORE_JUMP (jump, b, laststart + startoffset);
2185 b += 3;
2186 }
2187 else
2188 {
2189 /* A simple ? pattern. */
2190 assert (zero_times_ok);
2191 GET_BUFFER_SPACE (3);
2192 INSERT_JUMP (on_failure_jump, laststart, b + 3);
2193 b += 3;
2194 }
2195 }
2196 else /* not greedy */
2197 { /* I wish the greedy and non-greedy cases could be merged. */
2198
2199 GET_BUFFER_SPACE (7); /* We might use less. */
2200 if (many_times_ok)
2201 {
2202 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2203
2204 /* The non-greedy multiple match looks like a repeat..until:
2205 we only need a conditional jump at the end of the loop */
2206 if (emptyp) BUF_PUSH (no_op);
2207 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2208 : on_failure_jump, b, laststart);
2209 b += 3;
2210 if (zero_times_ok)
2211 {
2212 /* The repeat...until naturally matches one or more.
2213 To also match zero times, we need to first jump to
2214 the end of the loop (its conditional jump). */
2215 INSERT_JUMP (jump, laststart, b);
2216 b += 3;
2217 }
2218 }
2219 else
2220 {
2221 /* non-greedy a?? */
2222 INSERT_JUMP (jump, laststart, b + 3);
2223 b += 3;
2224 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2225 b += 3;
2226 }
2227 }
2228 }
2229 pending_exact = 0;
2230 break;
2231
2232
2233 case '.':
2234 laststart = b;
2235 BUF_PUSH (anychar);
2236 break;
2237
2238
2239 case '[':
2240 {
2241 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2242
2243 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2244
2245 /* Ensure that we have enough space to push a charset: the
2246 opcode, the length count, and the bitset; 34 bytes in all. */
2247 GET_BUFFER_SPACE (34);
2248
2249 laststart = b;
2250
2251 /* We test `*p == '^' twice, instead of using an if
2252 statement, so we only need one BUF_PUSH. */
2253 BUF_PUSH (*p == '^' ? charset_not : charset);
2254 if (*p == '^')
2255 p++;
2256
2257 /* Remember the first position in the bracket expression. */
2258 p1 = p;
2259
2260 /* Push the number of bytes in the bitmap. */
2261 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2262
2263 /* Clear the whole map. */
2264 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
2265
2266 /* charset_not matches newline according to a syntax bit. */
2267 if ((re_opcode_t) b[-2] == charset_not
2268 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2269 SET_LIST_BIT ('\n');
2270
2271 /* Read in characters and ranges, setting map bits. */
2272 for (;;)
2273 {
2274 boolean escaped_char = false;
2275 const unsigned char *p2 = p;
2276
2277 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2278
2279 PATFETCH (c);
2280
2281 /* \ might escape characters inside [...] and [^...]. */
2282 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2283 {
2284 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2285
2286 PATFETCH (c);
2287 escaped_char = true;
2288 }
2289 else
2290 {
2291 /* Could be the end of the bracket expression. If it's
2292 not (i.e., when the bracket expression is `[]' so
2293 far), the ']' character bit gets set way below. */
2294 if (c == ']' && p2 != p1)
2295 break;
2296 }
2297
2298 /* What should we do for the character which is
2299 greater than 0x7F, but not BASE_LEADING_CODE_P?
2300 XXX */
2301
2302 /* See if we're at the beginning of a possible character
2303 class. */
2304
2305 if (!escaped_char &&
2306 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2307 {
2308 /* Leave room for the null. */
2309 char str[CHAR_CLASS_MAX_LENGTH + 1];
2310 const unsigned char *class_beg;
2311
2312 PATFETCH (c);
2313 c1 = 0;
2314 class_beg = p;
2315
2316 /* If pattern is `[[:'. */
2317 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2318
2319 for (;;)
2320 {
2321 PATFETCH (c);
2322 if (c == ':' || c == ']' || p == pend
2323 || c1 == CHAR_CLASS_MAX_LENGTH)
2324 break;
2325 str[c1++] = c;
2326 }
2327 str[c1] = '\0';
2328
2329 /* If isn't a word bracketed by `[:' and `:]':
2330 undo the ending character, the letters, and
2331 leave the leading `:' and `[' (but set bits for
2332 them). */
2333 if (c == ':' && *p == ']')
2334 {
2335 int ch;
2336 boolean is_alnum = STREQ (str, "alnum");
2337 boolean is_alpha = STREQ (str, "alpha");
2338 boolean is_ascii = STREQ (str, "ascii");
2339 boolean is_blank = STREQ (str, "blank");
2340 boolean is_cntrl = STREQ (str, "cntrl");
2341 boolean is_digit = STREQ (str, "digit");
2342 boolean is_graph = STREQ (str, "graph");
2343 boolean is_lower = STREQ (str, "lower");
2344 boolean is_multibyte = STREQ (str, "multibyte");
2345 boolean is_nonascii = STREQ (str, "nonascii");
2346 boolean is_print = STREQ (str, "print");
2347 boolean is_punct = STREQ (str, "punct");
2348 boolean is_space = STREQ (str, "space");
2349 boolean is_unibyte = STREQ (str, "unibyte");
2350 boolean is_upper = STREQ (str, "upper");
2351 boolean is_word = STREQ (str, "word");
2352 boolean is_xdigit = STREQ (str, "xdigit");
2353
2354 if (!IS_CHAR_CLASS (str))
2355 FREE_STACK_RETURN (REG_ECTYPE);
2356
2357 /* Throw away the ] at the end of the character
2358 class. */
2359 PATFETCH (c);
2360
2361 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2362
2363 /* Most character classes in a multibyte match
2364 just set a flag. Exceptions are is_blank,
2365 is_digit, is_cntrl, and is_xdigit, since
2366 they can only match ASCII characters. We
2367 don't need to handle them for multibyte. */
2368
2369 if (multibyte)
2370 {
2371 int bit = 0;
2372
2373 if (is_alnum) bit = BIT_ALNUM;
2374 if (is_alpha) bit = BIT_ALPHA;
2375 if (is_ascii) bit = BIT_ASCII;
2376 if (is_graph) bit = BIT_GRAPH;
2377 if (is_lower) bit = BIT_LOWER;
2378 if (is_multibyte) bit = BIT_MULTIBYTE;
2379 if (is_nonascii) bit = BIT_NONASCII;
2380 if (is_print) bit = BIT_PRINT;
2381 if (is_punct) bit = BIT_PUNCT;
2382 if (is_space) bit = BIT_SPACE;
2383 if (is_unibyte) bit = BIT_UNIBYTE;
2384 if (is_upper) bit = BIT_UPPER;
2385 if (is_word) bit = BIT_WORD;
2386 if (bit)
2387 SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work,
2388 bit);
2389 }
2390
2391 /* Handle character classes for ASCII characters. */
2392 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
2393 {
2394 int translated = TRANSLATE (ch);
2395 /* This was split into 3 if's to
2396 avoid an arbitrary limit in some compiler. */
2397 if ( (is_alnum && ISALNUM (ch))
2398 || (is_alpha && ISALPHA (ch))
2399 || (is_blank && ISBLANK (ch))
2400 || (is_cntrl && ISCNTRL (ch)))
2401 SET_LIST_BIT (translated);
2402 if ( (is_digit && ISDIGIT (ch))
2403 || (is_graph && ISGRAPH (ch))
2404 || (is_lower && ISLOWER (ch))
2405 || (is_print && ISPRINT (ch)))
2406 SET_LIST_BIT (translated);
2407 if ( (is_punct && ISPUNCT (ch))
2408 || (is_space && ISSPACE (ch))
2409 || (is_upper && ISUPPER (ch))
2410 || (is_xdigit && ISXDIGIT (ch)))
2411 SET_LIST_BIT (translated);
2412 if ( (is_ascii && IS_REAL_ASCII (ch))
2413 || (is_nonascii && !IS_REAL_ASCII (ch))
2414 || (is_unibyte && ISUNIBYTE (ch))
2415 || (is_multibyte && !ISUNIBYTE (ch)))
2416 SET_LIST_BIT (translated);
2417
2418 if ( (is_word && ISWORD (ch)))
2419 SET_LIST_BIT (translated);
2420 }
2421
2422 /* Repeat the loop. */
2423 continue;
2424 }
2425 else
2426 {
2427 /* Go back to right after the "[:". */
2428 p = class_beg;
2429 SET_LIST_BIT ('[');
2430
2431 /* Because the `:' may starts the range, we
2432 can't simply set bit and repeat the loop.
2433 Instead, just set it to C and handle below. */
2434 c = ':';
2435 }
2436 }
2437
2438 if (p < pend && p[0] == '-' && p[1] != ']')
2439 {
2440
2441 /* Discard the `-'. */
2442 PATFETCH (c1);
2443
2444 /* Fetch the character which ends the range. */
2445 PATFETCH (c1);
2446
2447 if (SINGLE_BYTE_CHAR_P (c))
2448 {
2449 if (! SINGLE_BYTE_CHAR_P (c1))
2450 {
2451 /* Handle a range such as \177-\377 in
2452 multibyte mode. Split that into two
2453 ranges, the low one ending at 0237, and
2454 the high one starting at the smallest
2455 character in the charset of C1 and
2456 ending at C1. */
2457 int charset = CHAR_CHARSET (c1);
2458 int c2 = MAKE_CHAR (charset, 0, 0);
2459
2460 SET_RANGE_TABLE_WORK_AREA (range_table_work,
2461 c2, c1);
2462 c1 = 0237;
2463 }
2464 }
2465 else if (!SAME_CHARSET_P (c, c1))
2466 FREE_STACK_RETURN (REG_ERANGE);
2467 }
2468 else
2469 /* Range from C to C. */
2470 c1 = c;
2471
2472 /* Set the range ... */
2473 if (SINGLE_BYTE_CHAR_P (c))
2474 /* ... into bitmap. */
2475 {
2476 unsigned this_char;
2477 int range_start = c, range_end = c1;
2478
2479 /* If the start is after the end, the range is empty. */
2480 if (range_start > range_end)
2481 {
2482 if (syntax & RE_NO_EMPTY_RANGES)
2483 FREE_STACK_RETURN (REG_ERANGE);
2484 /* Else, repeat the loop. */
2485 }
2486 else
2487 {
2488 for (this_char = range_start; this_char <= range_end;
2489 this_char++)
2490 SET_LIST_BIT (TRANSLATE (this_char));
2491 }
2492 }
2493 else
2494 /* ... into range table. */
2495 SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1);
2496 }
2497
2498 /* Discard any (non)matching list bytes that are all 0 at the
2499 end of the map. Decrease the map-length byte too. */
2500 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
2501 b[-1]--;
2502 b += b[-1];
2503
2504 /* Build real range table from work area. */
2505 if (RANGE_TABLE_WORK_USED (range_table_work)
2506 || RANGE_TABLE_WORK_BITS (range_table_work))
2507 {
2508 int i;
2509 int used = RANGE_TABLE_WORK_USED (range_table_work);
2510
2511 /* Allocate space for COUNT + RANGE_TABLE. Needs two
2512 bytes for flags, two for COUNT, and three bytes for
2513 each character. */
2514 GET_BUFFER_SPACE (4 + used * 3);
2515
2516 /* Indicate the existence of range table. */
2517 laststart[1] |= 0x80;
2518
2519 /* Store the character class flag bits into the range table.
2520 If not in emacs, these flag bits are always 0. */
2521 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
2522 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
2523
2524 STORE_NUMBER_AND_INCR (b, used / 2);
2525 for (i = 0; i < used; i++)
2526 STORE_CHARACTER_AND_INCR
2527 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
2528 }
2529 }
2530 break;
2531
2532
2533 case '(':
2534 if (syntax & RE_NO_BK_PARENS)
2535 goto handle_open;
2536 else
2537 goto normal_char;
2538
2539
2540 case ')':
2541 if (syntax & RE_NO_BK_PARENS)
2542 goto handle_close;
2543 else
2544 goto normal_char;
2545
2546
2547 case '\n':
2548 if (syntax & RE_NEWLINE_ALT)
2549 goto handle_alt;
2550 else
2551 goto normal_char;
2552
2553
2554 case '|':
2555 if (syntax & RE_NO_BK_VBAR)
2556 goto handle_alt;
2557 else
2558 goto normal_char;
2559
2560
2561 case '{':
2562 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2563 goto handle_interval;
2564 else
2565 goto normal_char;
2566
2567
2568 case '\\':
2569 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2570
2571 /* Do not translate the character after the \, so that we can
2572 distinguish, e.g., \B from \b, even if we normally would
2573 translate, e.g., B to b. */
2574 PATFETCH_RAW (c);
2575
2576 switch (c)
2577 {
2578 case '(':
2579 if (syntax & RE_NO_BK_PARENS)
2580 goto normal_backslash;
2581
2582 handle_open:
2583 {
2584 int shy = 0;
2585 if (p+1 < pend)
2586 {
2587 /* Look for a special (?...) construct */
2588 if ((syntax & RE_SHY_GROUPS) && *p == '?')
2589 {
2590 PATFETCH (c); /* Gobble up the '?'. */
2591 PATFETCH (c);
2592 switch (c)
2593 {
2594 case ':': shy = 1; break;
2595 default:
2596 /* Only (?:...) is supported right now. */
2597 FREE_STACK_RETURN (REG_BADPAT);
2598 }
2599 }
2600 }
2601
2602 if (!shy)
2603 {
2604 bufp->re_nsub++;
2605 regnum++;
2606 }
2607
2608 if (COMPILE_STACK_FULL)
2609 {
2610 RETALLOC (compile_stack.stack, compile_stack.size << 1,
2611 compile_stack_elt_t);
2612 if (compile_stack.stack == NULL) return REG_ESPACE;
2613
2614 compile_stack.size <<= 1;
2615 }
2616
2617 /* These are the values to restore when we hit end of this
2618 group. They are all relative offsets, so that if the
2619 whole pattern moves because of realloc, they will still
2620 be valid. */
2621 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
2622 COMPILE_STACK_TOP.fixup_alt_jump
2623 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
2624 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
2625 COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
2626
2627 /* Do not push a
2628 start_memory for groups beyond the last one we can
2629 represent in the compiled pattern. */
2630 if (regnum <= MAX_REGNUM && !shy)
2631 BUF_PUSH_2 (start_memory, regnum);
2632
2633 compile_stack.avail++;
2634
2635 fixup_alt_jump = 0;
2636 laststart = 0;
2637 begalt = b;
2638 /* If we've reached MAX_REGNUM groups, then this open
2639 won't actually generate any code, so we'll have to
2640 clear pending_exact explicitly. */
2641 pending_exact = 0;
2642 break;
2643 }
2644
2645 case ')':
2646 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
2647
2648 if (COMPILE_STACK_EMPTY)
2649 {
2650 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2651 goto normal_backslash;
2652 else
2653 FREE_STACK_RETURN (REG_ERPAREN);
2654 }
2655
2656 handle_close:
2657 FIXUP_ALT_JUMP ();
2658
2659 /* See similar code for backslashed left paren above. */
2660 if (COMPILE_STACK_EMPTY)
2661 {
2662 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
2663 goto normal_char;
2664 else
2665 FREE_STACK_RETURN (REG_ERPAREN);
2666 }
2667
2668 /* Since we just checked for an empty stack above, this
2669 ``can't happen''. */
2670 assert (compile_stack.avail != 0);
2671 {
2672 /* We don't just want to restore into `regnum', because
2673 later groups should continue to be numbered higher,
2674 as in `(ab)c(de)' -- the second group is #2. */
2675 regnum_t this_group_regnum;
2676
2677 compile_stack.avail--;
2678 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
2679 fixup_alt_jump
2680 = COMPILE_STACK_TOP.fixup_alt_jump
2681 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
2682 : 0;
2683 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
2684 this_group_regnum = COMPILE_STACK_TOP.regnum;
2685 /* If we've reached MAX_REGNUM groups, then this open
2686 won't actually generate any code, so we'll have to
2687 clear pending_exact explicitly. */
2688 pending_exact = 0;
2689
2690 /* We're at the end of the group, so now we know how many
2691 groups were inside this one. */
2692 if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
2693 BUF_PUSH_2 (stop_memory, this_group_regnum);
2694 }
2695 break;
2696
2697
2698 case '|': /* `\|'. */
2699 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
2700 goto normal_backslash;
2701 handle_alt:
2702 if (syntax & RE_LIMITED_OPS)
2703 goto normal_char;
2704
2705 /* Insert before the previous alternative a jump which
2706 jumps to this alternative if the former fails. */
2707 GET_BUFFER_SPACE (3);
2708 INSERT_JUMP (on_failure_jump, begalt, b + 6);
2709 pending_exact = 0;
2710 b += 3;
2711
2712 /* The alternative before this one has a jump after it
2713 which gets executed if it gets matched. Adjust that
2714 jump so it will jump to this alternative's analogous
2715 jump (put in below, which in turn will jump to the next
2716 (if any) alternative's such jump, etc.). The last such
2717 jump jumps to the correct final destination. A picture:
2718 _____ _____
2719 | | | |
2720 | v | v
2721 a | b | c
2722
2723 If we are at `b', then fixup_alt_jump right now points to a
2724 three-byte space after `a'. We'll put in the jump, set
2725 fixup_alt_jump to right after `b', and leave behind three
2726 bytes which we'll fill in when we get to after `c'. */
2727
2728 FIXUP_ALT_JUMP ();
2729
2730 /* Mark and leave space for a jump after this alternative,
2731 to be filled in later either by next alternative or
2732 when know we're at the end of a series of alternatives. */
2733 fixup_alt_jump = b;
2734 GET_BUFFER_SPACE (3);
2735 b += 3;
2736
2737 laststart = 0;
2738 begalt = b;
2739 break;
2740
2741
2742 case '{':
2743 /* If \{ is a literal. */
2744 if (!(syntax & RE_INTERVALS)
2745 /* If we're at `\{' and it's not the open-interval
2746 operator. */
2747 || (syntax & RE_NO_BK_BRACES)
2748 /* What is that? -sm */
2749 /* || (p - 2 == pattern && p == pend) */)
2750 goto normal_backslash;
2751
2752 handle_interval:
2753 {
2754 /* If got here, then the syntax allows intervals. */
2755
2756 /* At least (most) this many matches must be made. */
2757 int lower_bound = 0, upper_bound = -1;
2758
2759 beg_interval = p;
2760
2761 if (p == pend)
2762 {
2763 if (syntax & RE_NO_BK_BRACES)
2764 goto unfetch_interval;
2765 else
2766 FREE_STACK_RETURN (REG_EBRACE);
2767 }
2768
2769 GET_UNSIGNED_NUMBER (lower_bound);
2770
2771 if (c == ',')
2772 GET_UNSIGNED_NUMBER (upper_bound);
2773 else
2774 /* Interval such as `{1}' => match exactly once. */
2775 upper_bound = lower_bound;
2776
2777 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
2778 || (upper_bound >= 0 && lower_bound > upper_bound))
2779 {
2780 if (syntax & RE_NO_BK_BRACES)
2781 goto unfetch_interval;
2782 else
2783 FREE_STACK_RETURN (REG_BADBR);
2784 }
2785
2786 if (!(syntax & RE_NO_BK_BRACES))
2787 {
2788 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE);
2789
2790 PATFETCH (c);
2791 }
2792
2793 if (c != '}')
2794 {
2795 if (syntax & RE_NO_BK_BRACES)
2796 goto unfetch_interval;
2797 else
2798 FREE_STACK_RETURN (REG_BADBR);
2799 }
2800
2801 /* We just parsed a valid interval. */
2802
2803 /* If it's invalid to have no preceding re. */
2804 if (!laststart)
2805 {
2806 if (syntax & RE_CONTEXT_INVALID_OPS)
2807 FREE_STACK_RETURN (REG_BADRPT);
2808 else if (syntax & RE_CONTEXT_INDEP_OPS)
2809 laststart = b;
2810 else
2811 goto unfetch_interval;
2812 }
2813
2814 if (upper_bound == 0)
2815 /* If the upper bound is zero, just drop the sub pattern
2816 altogether. */
2817 b = laststart;
2818 else if (lower_bound == 1 && upper_bound == 1)
2819 /* Just match it once: nothing to do here. */
2820 ;
2821
2822 /* Otherwise, we have a nontrivial interval. When
2823 we're all done, the pattern will look like:
2824 set_number_at <jump count> <upper bound>
2825 set_number_at <succeed_n count> <lower bound>
2826 succeed_n <after jump addr> <succeed_n count>
2827 <body of loop>
2828 jump_n <succeed_n addr> <jump count>
2829 (The upper bound and `jump_n' are omitted if
2830 `upper_bound' is 1, though.) */
2831 else
2832 { /* If the upper bound is > 1, we need to insert
2833 more at the end of the loop. */
2834 unsigned int nbytes = (upper_bound < 0 ? 3
2835 : upper_bound > 1 ? 5 : 0);
2836 unsigned int startoffset = 0;
2837
2838 GET_BUFFER_SPACE (20); /* We might use less. */
2839
2840 if (lower_bound == 0)
2841 {
2842 /* A succeed_n that starts with 0 is really a
2843 a simple on_failure_jump_loop. */
2844 INSERT_JUMP (on_failure_jump_loop, laststart,
2845 b + 3 + nbytes);
2846 b += 3;
2847 }
2848 else
2849 {
2850 /* Initialize lower bound of the `succeed_n', even
2851 though it will be set during matching by its
2852 attendant `set_number_at' (inserted next),
2853 because `re_compile_fastmap' needs to know.
2854 Jump to the `jump_n' we might insert below. */
2855 INSERT_JUMP2 (succeed_n, laststart,
2856 b + 5 + nbytes,
2857 lower_bound);
2858 b += 5;
2859
2860 /* Code to initialize the lower bound. Insert
2861 before the `succeed_n'. The `5' is the last two
2862 bytes of this `set_number_at', plus 3 bytes of
2863 the following `succeed_n'. */
2864 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
2865 b += 5;
2866 startoffset += 5;
2867 }
2868
2869 if (upper_bound < 0)
2870 {
2871 /* A negative upper bound stands for infinity,
2872 in which case it degenerates to a plain jump. */
2873 STORE_JUMP (jump, b, laststart + startoffset);
2874 b += 3;
2875 }
2876 else if (upper_bound > 1)
2877 { /* More than one repetition is allowed, so
2878 append a backward jump to the `succeed_n'
2879 that starts this interval.
2880
2881 When we've reached this during matching,
2882 we'll have matched the interval once, so
2883 jump back only `upper_bound - 1' times. */
2884 STORE_JUMP2 (jump_n, b, laststart + startoffset,
2885 upper_bound - 1);
2886 b += 5;
2887
2888 /* The location we want to set is the second
2889 parameter of the `jump_n'; that is `b-2' as
2890 an absolute address. `laststart' will be
2891 the `set_number_at' we're about to insert;
2892 `laststart+3' the number to set, the source
2893 for the relative address. But we are
2894 inserting into the middle of the pattern --
2895 so everything is getting moved up by 5.
2896 Conclusion: (b - 2) - (laststart + 3) + 5,
2897 i.e., b - laststart.
2898
2899 We insert this at the beginning of the loop
2900 so that if we fail during matching, we'll
2901 reinitialize the bounds. */
2902 insert_op2 (set_number_at, laststart, b - laststart,
2903 upper_bound - 1, b);
2904 b += 5;
2905 }
2906 }
2907 pending_exact = 0;
2908 beg_interval = NULL;
2909 }
2910 break;
2911
2912 unfetch_interval:
2913 /* If an invalid interval, match the characters as literals. */
2914 assert (beg_interval);
2915 p = beg_interval;
2916 beg_interval = NULL;
2917
2918 /* normal_char and normal_backslash need `c'. */
2919 c = '{';
2920
2921 if (!(syntax & RE_NO_BK_BRACES))
2922 {
2923 assert (p > pattern && p[-1] == '\\');
2924 goto normal_backslash;
2925 }
2926 else
2927 goto normal_char;
2928
2929 #ifdef emacs
2930 /* There is no way to specify the before_dot and after_dot
2931 operators. rms says this is ok. --karl */
2932 case '=':
2933 BUF_PUSH (at_dot);
2934 break;
2935
2936 case 's':
2937 laststart = b;
2938 PATFETCH (c);
2939 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
2940 break;
2941
2942 case 'S':
2943 laststart = b;
2944 PATFETCH (c);
2945 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
2946 break;
2947
2948 case 'c':
2949 laststart = b;
2950 PATFETCH_RAW (c);
2951 BUF_PUSH_2 (categoryspec, c);
2952 break;
2953
2954 case 'C':
2955 laststart = b;
2956 PATFETCH_RAW (c);
2957 BUF_PUSH_2 (notcategoryspec, c);
2958 break;
2959 #endif /* emacs */
2960
2961
2962 case 'w':
2963 laststart = b;
2964 BUF_PUSH_2 (syntaxspec, Sword);
2965 break;
2966
2967
2968 case 'W':
2969 laststart = b;
2970 BUF_PUSH_2 (notsyntaxspec, Sword);
2971 break;
2972
2973
2974 case '<':
2975 BUF_PUSH (wordbeg);
2976 break;
2977
2978 case '>':
2979 BUF_PUSH (wordend);
2980 break;
2981
2982 case 'b':
2983 BUF_PUSH (wordbound);
2984 break;
2985
2986 case 'B':
2987 BUF_PUSH (notwordbound);
2988 break;
2989
2990 case '`':
2991 BUF_PUSH (begbuf);
2992 break;
2993
2994 case '\'':
2995 BUF_PUSH (endbuf);
2996 break;
2997
2998 case '1': case '2': case '3': case '4': case '5':
2999 case '6': case '7': case '8': case '9':
3000 if (syntax & RE_NO_BK_REFS)
3001 goto normal_char;
3002
3003 c1 = c - '0';
3004
3005 if (c1 > regnum)
3006 FREE_STACK_RETURN (REG_ESUBREG);
3007
3008 /* Can't back reference to a subexpression if inside of it. */
3009 if (group_in_compile_stack (compile_stack, c1))
3010 goto normal_char;
3011
3012 laststart = b;
3013 BUF_PUSH_2 (duplicate, c1);
3014 break;
3015
3016
3017 case '+':
3018 case '?':
3019 if (syntax & RE_BK_PLUS_QM)
3020 goto handle_plus;
3021 else
3022 goto normal_backslash;
3023
3024 default:
3025 normal_backslash:
3026 /* You might think it would be useful for \ to mean
3027 not to translate; but if we don't translate it
3028 it will never match anything. */
3029 c = TRANSLATE (c);
3030 goto normal_char;
3031 }
3032 break;
3033
3034
3035 default:
3036 /* Expects the character in `c'. */
3037 normal_char:
3038 /* If no exactn currently being built. */
3039 if (!pending_exact
3040
3041 /* If last exactn not at current position. */
3042 || pending_exact + *pending_exact + 1 != b
3043
3044 /* We have only one byte following the exactn for the count. */
3045 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3046
3047 /* If followed by a repetition operator. */
3048 || (p != pend && (*p == '*' || *p == '^'))
3049 || ((syntax & RE_BK_PLUS_QM)
3050 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3051 : p != pend && (*p == '+' || *p == '?'))
3052 || ((syntax & RE_INTERVALS)
3053 && ((syntax & RE_NO_BK_BRACES)
3054 ? p != pend && *p == '{'
3055 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3056 {
3057 /* Start building a new exactn. */
3058
3059 laststart = b;
3060
3061 BUF_PUSH_2 (exactn, 0);
3062 pending_exact = b - 1;
3063 }
3064
3065 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3066 {
3067 int len;
3068
3069 if (multibyte)
3070 len = CHAR_STRING (c, b);
3071 else
3072 *b = c, len = 1;
3073 b += len;
3074 (*pending_exact) += len;
3075 }
3076
3077 break;
3078 } /* switch (c) */
3079 } /* while p != pend */
3080
3081
3082 /* Through the pattern now. */
3083
3084 FIXUP_ALT_JUMP ();
3085
3086 if (!COMPILE_STACK_EMPTY)
3087 FREE_STACK_RETURN (REG_EPAREN);
3088
3089 /* If we don't want backtracking, force success
3090 the first time we reach the end of the compiled pattern. */
3091 if (syntax & RE_NO_POSIX_BACKTRACKING)
3092 BUF_PUSH (succeed);
3093
3094 free (compile_stack.stack);
3095
3096 /* We have succeeded; set the length of the buffer. */
3097 bufp->used = b - bufp->buffer;
3098
3099 #ifdef DEBUG
3100 if (debug > 0)
3101 {
3102 re_compile_fastmap (bufp);
3103 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3104 print_compiled_pattern (bufp);
3105 }
3106 debug--;
3107 #endif /* DEBUG */
3108
3109 #ifndef MATCH_MAY_ALLOCATE
3110 /* Initialize the failure stack to the largest possible stack. This
3111 isn't necessary unless we're trying to avoid calling alloca in
3112 the search and match routines. */
3113 {
3114 int num_regs = bufp->re_nsub + 1;
3115
3116 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3117 {
3118 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3119
3120 if (! fail_stack.stack)
3121 fail_stack.stack
3122 = (fail_stack_elt_t *) malloc (fail_stack.size
3123 * sizeof (fail_stack_elt_t));
3124 else
3125 fail_stack.stack
3126 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3127 (fail_stack.size
3128 * sizeof (fail_stack_elt_t)));
3129 }
3130
3131 regex_grow_registers (num_regs);
3132 }
3133 #endif /* not MATCH_MAY_ALLOCATE */
3134
3135 return REG_NOERROR;
3136 } /* regex_compile */
3137 \f
3138 /* Subroutines for `regex_compile'. */
3139
3140 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3141
3142 static void
3143 store_op1 (op, loc, arg)
3144 re_opcode_t op;
3145 unsigned char *loc;
3146 int arg;
3147 {
3148 *loc = (unsigned char) op;
3149 STORE_NUMBER (loc + 1, arg);
3150 }
3151
3152
3153 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3154
3155 static void
3156 store_op2 (op, loc, arg1, arg2)
3157 re_opcode_t op;
3158 unsigned char *loc;
3159 int arg1, arg2;
3160 {
3161 *loc = (unsigned char) op;
3162 STORE_NUMBER (loc + 1, arg1);
3163 STORE_NUMBER (loc + 3, arg2);
3164 }
3165
3166
3167 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3168 for OP followed by two-byte integer parameter ARG. */
3169
3170 static void
3171 insert_op1 (op, loc, arg, end)
3172 re_opcode_t op;
3173 unsigned char *loc;
3174 int arg;
3175 unsigned char *end;
3176 {
3177 register unsigned char *pfrom = end;
3178 register unsigned char *pto = end + 3;
3179
3180 while (pfrom != loc)
3181 *--pto = *--pfrom;
3182
3183 store_op1 (op, loc, arg);
3184 }
3185
3186
3187 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3188
3189 static void
3190 insert_op2 (op, loc, arg1, arg2, end)
3191 re_opcode_t op;
3192 unsigned char *loc;
3193 int arg1, arg2;
3194 unsigned char *end;
3195 {
3196 register unsigned char *pfrom = end;
3197 register unsigned char *pto = end + 5;
3198
3199 while (pfrom != loc)
3200 *--pto = *--pfrom;
3201
3202 store_op2 (op, loc, arg1, arg2);
3203 }
3204
3205
3206 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3207 after an alternative or a begin-subexpression. We assume there is at
3208 least one character before the ^. */
3209
3210 static boolean
3211 at_begline_loc_p (pattern, p, syntax)
3212 const unsigned char *pattern, *p;
3213 reg_syntax_t syntax;
3214 {
3215 const unsigned char *prev = p - 2;
3216 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3217
3218 return
3219 /* After a subexpression? */
3220 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3221 /* After an alternative? */
3222 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3223 /* After a shy subexpression? */
3224 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3225 && prev[-1] == '?' && prev[-2] == '('
3226 && (syntax & RE_NO_BK_PARENS
3227 || (prev - 3 >= pattern && prev[-3] == '\\')));
3228 }
3229
3230
3231 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3232 at least one character after the $, i.e., `P < PEND'. */
3233
3234 static boolean
3235 at_endline_loc_p (p, pend, syntax)
3236 const unsigned char *p, *pend;
3237 reg_syntax_t syntax;
3238 {
3239 const unsigned char *next = p;
3240 boolean next_backslash = *next == '\\';
3241 const unsigned char *next_next = p + 1 < pend ? p + 1 : 0;
3242
3243 return
3244 /* Before a subexpression? */
3245 (syntax & RE_NO_BK_PARENS ? *next == ')'
3246 : next_backslash && next_next && *next_next == ')')
3247 /* Before an alternative? */
3248 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3249 : next_backslash && next_next && *next_next == '|');
3250 }
3251
3252
3253 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3254 false if it's not. */
3255
3256 static boolean
3257 group_in_compile_stack (compile_stack, regnum)
3258 compile_stack_type compile_stack;
3259 regnum_t regnum;
3260 {
3261 int this_element;
3262
3263 for (this_element = compile_stack.avail - 1;
3264 this_element >= 0;
3265 this_element--)
3266 if (compile_stack.stack[this_element].regnum == regnum)
3267 return true;
3268
3269 return false;
3270 }
3271 \f
3272 /* analyse_first.
3273 If fastmap is non-NULL, go through the pattern and fill fastmap
3274 with all the possible leading chars. If fastmap is NULL, don't
3275 bother filling it up (obviously) and only return whether the
3276 pattern could potentially match the empty string.
3277
3278 Return 1 if p..pend might match the empty string.
3279 Return 0 if p..pend matches at least one char.
3280 Return -1 if p..pend matches at least one char, but fastmap was not
3281 updated accurately.
3282 Return -2 if an error occurred. */
3283
3284 static int
3285 analyse_first (p, pend, fastmap, multibyte)
3286 unsigned char *p, *pend;
3287 char *fastmap;
3288 const int multibyte;
3289 {
3290 int j, k;
3291 boolean not;
3292 #ifdef MATCH_MAY_ALLOCATE
3293 fail_stack_type fail_stack;
3294 #endif
3295 #ifndef REGEX_MALLOC
3296 char *destination;
3297 #endif
3298
3299 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
3300 /* This holds the pointer to the failure stack, when
3301 it is allocated relocatably. */
3302 fail_stack_elt_t *failure_stack_ptr;
3303 #endif
3304
3305 /* Assume that each path through the pattern can be null until
3306 proven otherwise. We set this false at the bottom of switch
3307 statement, to which we get only if a particular path doesn't
3308 match the empty string. */
3309 boolean path_can_be_null = true;
3310
3311 /* If all elements for base leading-codes in fastmap is set, this
3312 flag is set true. */
3313 boolean match_any_multibyte_characters = false;
3314
3315 assert (p);
3316
3317 INIT_FAIL_STACK ();
3318
3319 /* The loop below works as follows:
3320 - It has a working-list kept in the PATTERN_STACK and which basically
3321 starts by only containing a pointer to the first operation.
3322 - If the opcode we're looking at is a match against some set of
3323 chars, then we add those chars to the fastmap and go on to the
3324 next work element from the worklist (done via `break').
3325 - If the opcode is a control operator on the other hand, we either
3326 ignore it (if it's meaningless at this point, such as `start_memory')
3327 or execute it (if it's a jump). If the jump has several destinations
3328 (i.e. `on_failure_jump'), then we push the other destination onto the
3329 worklist.
3330 We guarantee termination by ignoring backward jumps (more or less),
3331 so that `p' is monotonically increasing. More to the point, we
3332 never set `p' (or push) anything `<= p1'. */
3333
3334 /* If can_be_null is set, then the fastmap will not be used anyway. */
3335 while (1)
3336 {
3337 /* `p1' is used as a marker of how far back a `on_failure_jump'
3338 can go without being ignored. It is normally equal to `p'
3339 (which prevents any backward `on_failure_jump') except right
3340 after a plain `jump', to allow patterns such as:
3341 0: jump 10
3342 3..9: <body>
3343 10: on_failure_jump 3
3344 as used for the *? operator. */
3345 unsigned char *p1 = p;
3346
3347 if (p >= pend)
3348 {
3349 if (path_can_be_null)
3350 return (RESET_FAIL_STACK (), 1);
3351
3352 /* We have reached the (effective) end of pattern. */
3353 if (PATTERN_STACK_EMPTY ())
3354 return (RESET_FAIL_STACK (), 0);
3355
3356 p = (unsigned char*) POP_PATTERN_OP ();
3357 path_can_be_null = true;
3358 continue;
3359 }
3360
3361 /* We should never be about to go beyond the end of the pattern. */
3362 assert (p < pend);
3363
3364 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3365 {
3366 case succeed:
3367 p = pend;
3368 continue;
3369
3370 case duplicate:
3371 /* If the first character has to match a backreference, that means
3372 that the group was empty (since it already matched). Since this
3373 is the only case that interests us here, we can assume that the
3374 backreference must match the empty string. */
3375 p++;
3376 continue;
3377
3378
3379 /* Following are the cases which match a character. These end
3380 with `break'. */
3381
3382 case exactn:
3383 if (fastmap)
3384 {
3385 int c = RE_STRING_CHAR (p + 1, pend - p);
3386
3387 if (SINGLE_BYTE_CHAR_P (c))
3388 fastmap[c] = 1;
3389 else
3390 fastmap[p[1]] = 1;
3391 }
3392 break;
3393
3394
3395 case anychar:
3396 /* We could put all the chars except for \n (and maybe \0)
3397 but we don't bother since it is generally not worth it. */
3398 if (!fastmap) break;
3399 return (RESET_FAIL_STACK (), -1);
3400
3401
3402 case charset_not:
3403 /* Chars beyond end of bitmap are possible matches.
3404 All the single-byte codes can occur in multibyte buffers.
3405 So any that are not listed in the charset
3406 are possible matches, even in multibyte buffers. */
3407 if (!fastmap) break;
3408 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
3409 j < (1 << BYTEWIDTH); j++)
3410 fastmap[j] = 1;
3411 /* Fallthrough */
3412 case charset:
3413 if (!fastmap) break;
3414 not = (re_opcode_t) *(p - 1) == charset_not;
3415 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
3416 j >= 0; j--)
3417 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
3418 fastmap[j] = 1;
3419
3420 if ((not && multibyte)
3421 /* Any character set can possibly contain a character
3422 which doesn't match the specified set of characters. */
3423 || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3424 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
3425 /* If we can match a character class, we can match
3426 any character set. */
3427 {
3428 set_fastmap_for_multibyte_characters:
3429 if (match_any_multibyte_characters == false)
3430 {
3431 for (j = 0x80; j < 0xA0; j++) /* XXX */
3432 if (BASE_LEADING_CODE_P (j))
3433 fastmap[j] = 1;
3434 match_any_multibyte_characters = true;
3435 }
3436 }
3437
3438 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
3439 && match_any_multibyte_characters == false)
3440 {
3441 /* Set fastmap[I] 1 where I is a base leading code of each
3442 multibyte character in the range table. */
3443 int c, count;
3444
3445 /* Make P points the range table. `+ 2' is to skip flag
3446 bits for a character class. */
3447 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
3448
3449 /* Extract the number of ranges in range table into COUNT. */
3450 EXTRACT_NUMBER_AND_INCR (count, p);
3451 for (; count > 0; count--, p += 2 * 3) /* XXX */
3452 {
3453 /* Extract the start of each range. */
3454 EXTRACT_CHARACTER (c, p);
3455 j = CHAR_CHARSET (c);
3456 fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1;
3457 }
3458 }
3459 break;
3460
3461 case syntaxspec:
3462 case notsyntaxspec:
3463 if (!fastmap) break;
3464 #ifndef emacs
3465 not = (re_opcode_t)p[-1] == notsyntaxspec;
3466 k = *p++;
3467 for (j = 0; j < (1 << BYTEWIDTH); j++)
3468 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
3469 fastmap[j] = 1;
3470 break;
3471 #else /* emacs */
3472 /* This match depends on text properties. These end with
3473 aborting optimizations. */
3474 return (RESET_FAIL_STACK (), -1);
3475
3476 case categoryspec:
3477 case notcategoryspec:
3478 if (!fastmap) break;
3479 not = (re_opcode_t)p[-1] == notcategoryspec;
3480 k = *p++;
3481 for (j = 0; j < (1 << BYTEWIDTH); j++)
3482 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
3483 fastmap[j] = 1;
3484
3485 if (multibyte)
3486 /* Any character set can possibly contain a character
3487 whose category is K (or not). */
3488 goto set_fastmap_for_multibyte_characters;
3489 break;
3490
3491 /* All cases after this match the empty string. These end with
3492 `continue'. */
3493
3494 case before_dot:
3495 case at_dot:
3496 case after_dot:
3497 #endif /* !emacs */
3498 case no_op:
3499 case begline:
3500 case endline:
3501 case begbuf:
3502 case endbuf:
3503 case wordbound:
3504 case notwordbound:
3505 case wordbeg:
3506 case wordend:
3507 continue;
3508
3509
3510 case jump:
3511 EXTRACT_NUMBER_AND_INCR (j, p);
3512 if (j < 0)
3513 /* Backward jumps can only go back to code that we've already
3514 visited. `re_compile' should make sure this is true. */
3515 break;
3516 p += j;
3517 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
3518 {
3519 case on_failure_jump:
3520 case on_failure_keep_string_jump:
3521 case on_failure_jump_loop:
3522 case on_failure_jump_nastyloop:
3523 case on_failure_jump_smart:
3524 p++;
3525 break;
3526 default:
3527 continue;
3528 };
3529 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
3530 to jump back to "just after here". */
3531 /* Fallthrough */
3532
3533 case on_failure_jump:
3534 case on_failure_keep_string_jump:
3535 case on_failure_jump_nastyloop:
3536 case on_failure_jump_loop:
3537 case on_failure_jump_smart:
3538 EXTRACT_NUMBER_AND_INCR (j, p);
3539 if (p + j <= p1)
3540 ; /* Backward jump to be ignored. */
3541 else if (!PUSH_PATTERN_OP (p + j, fail_stack))
3542 return (RESET_FAIL_STACK (), -2);
3543 continue;
3544
3545
3546 case jump_n:
3547 /* This code simply does not properly handle forward jump_n. */
3548 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
3549 p += 4;
3550 /* jump_n can either jump or fall through. The (backward) jump
3551 case has already been handled, so we only need to look at the
3552 fallthrough case. */
3553 continue;
3554
3555 case succeed_n:
3556 /* If N == 0, it should be an on_failure_jump_loop instead. */
3557 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
3558 p += 4;
3559 /* We only care about one iteration of the loop, so we don't
3560 need to consider the case where this behaves like an
3561 on_failure_jump. */
3562 continue;
3563
3564
3565 case set_number_at:
3566 p += 4;
3567 continue;
3568
3569
3570 case start_memory:
3571 case stop_memory:
3572 p += 1;
3573 continue;
3574
3575
3576 default:
3577 abort (); /* We have listed all the cases. */
3578 } /* switch *p++ */
3579
3580 /* Getting here means we have found the possible starting
3581 characters for one path of the pattern -- and that the empty
3582 string does not match. We need not follow this path further.
3583 Instead, look at the next alternative (remembered on the
3584 stack), or quit if no more. The test at the top of the loop
3585 does these things. */
3586 path_can_be_null = false;
3587 p = pend;
3588 } /* while p */
3589
3590 return (RESET_FAIL_STACK (), 0);
3591 } /* analyse_first */
3592 \f
3593 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3594 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3595 characters can start a string that matches the pattern. This fastmap
3596 is used by re_search to skip quickly over impossible starting points.
3597
3598 Character codes above (1 << BYTEWIDTH) are not represented in the
3599 fastmap, but the leading codes are represented. Thus, the fastmap
3600 indicates which character sets could start a match.
3601
3602 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3603 area as BUFP->fastmap.
3604
3605 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3606 the pattern buffer.
3607
3608 Returns 0 if we succeed, -2 if an internal error. */
3609
3610 int
3611 re_compile_fastmap (bufp)
3612 struct re_pattern_buffer *bufp;
3613 {
3614 char *fastmap = bufp->fastmap;
3615 int analysis;
3616
3617 assert (fastmap && bufp->buffer);
3618
3619 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3620 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3621
3622 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
3623 fastmap, RE_MULTIBYTE_P (bufp));
3624 if (analysis < -1)
3625 return analysis;
3626 bufp->can_be_null = (analysis != 0);
3627 return 0;
3628 } /* re_compile_fastmap */
3629 \f
3630 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
3631 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
3632 this memory for recording register information. STARTS and ENDS
3633 must be allocated using the malloc library routine, and must each
3634 be at least NUM_REGS * sizeof (regoff_t) bytes long.
3635
3636 If NUM_REGS == 0, then subsequent matches should allocate their own
3637 register data.
3638
3639 Unless this function is called, the first search or match using
3640 PATTERN_BUFFER will allocate its own register data, without
3641 freeing the old data. */
3642
3643 void
3644 re_set_registers (bufp, regs, num_regs, starts, ends)
3645 struct re_pattern_buffer *bufp;
3646 struct re_registers *regs;
3647 unsigned num_regs;
3648 regoff_t *starts, *ends;
3649 {
3650 if (num_regs)
3651 {
3652 bufp->regs_allocated = REGS_REALLOCATE;
3653 regs->num_regs = num_regs;
3654 regs->start = starts;
3655 regs->end = ends;
3656 }
3657 else
3658 {
3659 bufp->regs_allocated = REGS_UNALLOCATED;
3660 regs->num_regs = 0;
3661 regs->start = regs->end = (regoff_t *) 0;
3662 }
3663 }
3664 \f
3665 /* Searching routines. */
3666
3667 /* Like re_search_2, below, but only one string is specified, and
3668 doesn't let you say where to stop matching. */
3669
3670 int
3671 re_search (bufp, string, size, startpos, range, regs)
3672 struct re_pattern_buffer *bufp;
3673 const char *string;
3674 int size, startpos, range;
3675 struct re_registers *regs;
3676 {
3677 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
3678 regs, size);
3679 }
3680
3681 /* End address of virtual concatenation of string. */
3682 #define STOP_ADDR_VSTRING(P) \
3683 (((P) >= size1 ? string2 + size2 : string1 + size1))
3684
3685 /* Address of POS in the concatenation of virtual string. */
3686 #define POS_ADDR_VSTRING(POS) \
3687 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
3688
3689 /* Using the compiled pattern in BUFP->buffer, first tries to match the
3690 virtual concatenation of STRING1 and STRING2, starting first at index
3691 STARTPOS, then at STARTPOS + 1, and so on.
3692
3693 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
3694
3695 RANGE is how far to scan while trying to match. RANGE = 0 means try
3696 only at STARTPOS; in general, the last start tried is STARTPOS +
3697 RANGE.
3698
3699 In REGS, return the indices of the virtual concatenation of STRING1
3700 and STRING2 that matched the entire BUFP->buffer and its contained
3701 subexpressions.
3702
3703 Do not consider matching one past the index STOP in the virtual
3704 concatenation of STRING1 and STRING2.
3705
3706 We return either the position in the strings at which the match was
3707 found, -1 if no match, or -2 if error (such as failure
3708 stack overflow). */
3709
3710 int
3711 re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
3712 struct re_pattern_buffer *bufp;
3713 const char *str1, *str2;
3714 int size1, size2;
3715 int startpos;
3716 int range;
3717 struct re_registers *regs;
3718 int stop;
3719 {
3720 int val;
3721 re_char *string1 = (re_char*) str1;
3722 re_char *string2 = (re_char*) str2;
3723 register char *fastmap = bufp->fastmap;
3724 register RE_TRANSLATE_TYPE translate = bufp->translate;
3725 int total_size = size1 + size2;
3726 int endpos = startpos + range;
3727 int anchored_start = 0;
3728
3729 /* Nonzero if we have to concern multibyte character. */
3730 const boolean multibyte = RE_MULTIBYTE_P (bufp);
3731
3732 /* Check for out-of-range STARTPOS. */
3733 if (startpos < 0 || startpos > total_size)
3734 return -1;
3735
3736 /* Fix up RANGE if it might eventually take us outside
3737 the virtual concatenation of STRING1 and STRING2.
3738 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
3739 if (endpos < 0)
3740 range = 0 - startpos;
3741 else if (endpos > total_size)
3742 range = total_size - startpos;
3743
3744 /* If the search isn't to be a backwards one, don't waste time in a
3745 search for a pattern anchored at beginning of buffer. */
3746 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
3747 {
3748 if (startpos > 0)
3749 return -1;
3750 else
3751 range = 0;
3752 }
3753
3754 #ifdef emacs
3755 /* In a forward search for something that starts with \=.
3756 don't keep searching past point. */
3757 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
3758 {
3759 range = PT_BYTE - BEGV_BYTE - startpos;
3760 if (range < 0)
3761 return -1;
3762 }
3763 #endif /* emacs */
3764
3765 /* Update the fastmap now if not correct already. */
3766 if (fastmap && !bufp->fastmap_accurate)
3767 if (re_compile_fastmap (bufp) == -2)
3768 return -2;
3769
3770 /* See whether the pattern is anchored. */
3771 if (bufp->buffer[0] == begline)
3772 anchored_start = 1;
3773
3774 #ifdef emacs
3775 gl_state.object = re_match_object;
3776 {
3777 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
3778
3779 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
3780 }
3781 #endif
3782
3783 /* Loop through the string, looking for a place to start matching. */
3784 for (;;)
3785 {
3786 /* If the pattern is anchored,
3787 skip quickly past places we cannot match.
3788 We don't bother to treat startpos == 0 specially
3789 because that case doesn't repeat. */
3790 if (anchored_start && startpos > 0)
3791 {
3792 if (! (bufp->newline_anchor
3793 && ((startpos <= size1 ? string1[startpos - 1]
3794 : string2[startpos - size1 - 1])
3795 == '\n')))
3796 goto advance;
3797 }
3798
3799 /* If a fastmap is supplied, skip quickly over characters that
3800 cannot be the start of a match. If the pattern can match the
3801 null string, however, we don't need to skip characters; we want
3802 the first null string. */
3803 if (fastmap && startpos < total_size && !bufp->can_be_null)
3804 {
3805 register re_char *d;
3806 register unsigned int buf_ch;
3807
3808 d = POS_ADDR_VSTRING (startpos);
3809
3810 if (range > 0) /* Searching forwards. */
3811 {
3812 register int lim = 0;
3813 int irange = range;
3814
3815 if (startpos < size1 && startpos + range >= size1)
3816 lim = range - (size1 - startpos);
3817
3818 /* Written out as an if-else to avoid testing `translate'
3819 inside the loop. */
3820 if (RE_TRANSLATE_P (translate))
3821 {
3822 if (multibyte)
3823 while (range > lim)
3824 {
3825 int buf_charlen;
3826
3827 buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim,
3828 buf_charlen);
3829
3830 buf_ch = RE_TRANSLATE (translate, buf_ch);
3831 if (buf_ch >= 0400
3832 || fastmap[buf_ch])
3833 break;
3834
3835 range -= buf_charlen;
3836 d += buf_charlen;
3837 }
3838 else
3839 while (range > lim
3840 && !fastmap[RE_TRANSLATE (translate, *d)])
3841 {
3842 d++;
3843 range--;
3844 }
3845 }
3846 else
3847 while (range > lim && !fastmap[*d])
3848 {
3849 d++;
3850 range--;
3851 }
3852
3853 startpos += irange - range;
3854 }
3855 else /* Searching backwards. */
3856 {
3857 int room = (startpos >= size1
3858 ? size2 + size1 - startpos
3859 : size1 - startpos);
3860 buf_ch = RE_STRING_CHAR (d, room);
3861 buf_ch = TRANSLATE (buf_ch);
3862
3863 if (! (buf_ch >= 0400
3864 || fastmap[buf_ch]))
3865 goto advance;
3866 }
3867 }
3868
3869 /* If can't match the null string, and that's all we have left, fail. */
3870 if (range >= 0 && startpos == total_size && fastmap
3871 && !bufp->can_be_null)
3872 return -1;
3873
3874 val = re_match_2_internal (bufp, string1, size1, string2, size2,
3875 startpos, regs, stop);
3876 #ifndef REGEX_MALLOC
3877 #ifdef C_ALLOCA
3878 alloca (0);
3879 #endif
3880 #endif
3881
3882 if (val >= 0)
3883 return startpos;
3884
3885 if (val == -2)
3886 return -2;
3887
3888 advance:
3889 if (!range)
3890 break;
3891 else if (range > 0)
3892 {
3893 /* Update STARTPOS to the next character boundary. */
3894 if (multibyte)
3895 {
3896 re_char *p = POS_ADDR_VSTRING (startpos);
3897 re_char *pend = STOP_ADDR_VSTRING (startpos);
3898 int len = MULTIBYTE_FORM_LENGTH (p, pend - p);
3899
3900 range -= len;
3901 if (range < 0)
3902 break;
3903 startpos += len;
3904 }
3905 else
3906 {
3907 range--;
3908 startpos++;
3909 }
3910 }
3911 else
3912 {
3913 range++;
3914 startpos--;
3915
3916 /* Update STARTPOS to the previous character boundary. */
3917 if (multibyte)
3918 {
3919 re_char *p = POS_ADDR_VSTRING (startpos);
3920 int len = 0;
3921
3922 /* Find the head of multibyte form. */
3923 while (!CHAR_HEAD_P (*p))
3924 p--, len++;
3925
3926 /* Adjust it. */
3927 #if 0 /* XXX */
3928 if (MULTIBYTE_FORM_LENGTH (p, len + 1) != (len + 1))
3929 ;
3930 else
3931 #endif
3932 {
3933 range += len;
3934 if (range > 0)
3935 break;
3936
3937 startpos -= len;
3938 }
3939 }
3940 }
3941 }
3942 return -1;
3943 } /* re_search_2 */
3944 \f
3945 /* Declarations and macros for re_match_2. */
3946
3947 static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
3948 register int len,
3949 RE_TRANSLATE_TYPE translate,
3950 const int multibyte));
3951
3952 /* This converts PTR, a pointer into one of the search strings `string1'
3953 and `string2' into an offset from the beginning of that string. */
3954 #define POINTER_TO_OFFSET(ptr) \
3955 (FIRST_STRING_P (ptr) \
3956 ? ((regoff_t) ((ptr) - string1)) \
3957 : ((regoff_t) ((ptr) - string2 + size1)))
3958
3959 /* Call before fetching a character with *d. This switches over to
3960 string2 if necessary.
3961 Check re_match_2_internal for a discussion of why end_match_2 might
3962 not be within string2 (but be equal to end_match_1 instead). */
3963 #define PREFETCH() \
3964 while (d == dend) \
3965 { \
3966 /* End of string2 => fail. */ \
3967 if (dend == end_match_2) \
3968 goto fail; \
3969 /* End of string1 => advance to string2. */ \
3970 d = string2; \
3971 dend = end_match_2; \
3972 }
3973
3974 /* Call before fetching a char with *d if you already checked other limits.
3975 This is meant for use in lookahead operations like wordend, etc..
3976 where we might need to look at parts of the string that might be
3977 outside of the LIMITs (i.e past `stop'). */
3978 #define PREFETCH_NOLIMIT() \
3979 if (d == end1) \
3980 { \
3981 d = string2; \
3982 dend = end_match_2; \
3983 } \
3984
3985 /* Test if at very beginning or at very end of the virtual concatenation
3986 of `string1' and `string2'. If only one string, it's `string2'. */
3987 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3988 #define AT_STRINGS_END(d) ((d) == end2)
3989
3990
3991 /* Test if D points to a character which is word-constituent. We have
3992 two special cases to check for: if past the end of string1, look at
3993 the first character in string2; and if before the beginning of
3994 string2, look at the last character in string1. */
3995 #define WORDCHAR_P(d) \
3996 (SYNTAX ((d) == end1 ? *string2 \
3997 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3998 == Sword)
3999
4000 /* Disabled due to a compiler bug -- see comment at case wordbound */
4001
4002 /* The comment at case wordbound is following one, but we don't use
4003 AT_WORD_BOUNDARY anymore to support multibyte form.
4004
4005 The DEC Alpha C compiler 3.x generates incorrect code for the
4006 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4007 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4008 macro and introducing temporary variables works around the bug. */
4009
4010 #if 0
4011 /* Test if the character before D and the one at D differ with respect
4012 to being word-constituent. */
4013 #define AT_WORD_BOUNDARY(d) \
4014 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4015 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4016 #endif
4017
4018 /* Free everything we malloc. */
4019 #ifdef MATCH_MAY_ALLOCATE
4020 #define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4021 #define FREE_VARIABLES() \
4022 do { \
4023 REGEX_FREE_STACK (fail_stack.stack); \
4024 FREE_VAR (regstart); \
4025 FREE_VAR (regend); \
4026 FREE_VAR (best_regstart); \
4027 FREE_VAR (best_regend); \
4028 } while (0)
4029 #else
4030 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4031 #endif /* not MATCH_MAY_ALLOCATE */
4032
4033 \f
4034 /* Optimization routines. */
4035
4036 /* If the operation is a match against one or more chars,
4037 return a pointer to the next operation, else return NULL. */
4038 static unsigned char *
4039 skip_one_char (p)
4040 unsigned char *p;
4041 {
4042 switch (SWITCH_ENUM_CAST (*p++))
4043 {
4044 case anychar:
4045 break;
4046
4047 case exactn:
4048 p += *p + 1;
4049 break;
4050
4051 case charset_not:
4052 case charset:
4053 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4054 {
4055 int mcnt;
4056 p = CHARSET_RANGE_TABLE (p - 1);
4057 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4058 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4059 }
4060 else
4061 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4062 break;
4063
4064 case syntaxspec:
4065 case notsyntaxspec:
4066 #ifdef emacs
4067 case categoryspec:
4068 case notcategoryspec:
4069 #endif /* emacs */
4070 p++;
4071 break;
4072
4073 default:
4074 p = NULL;
4075 }
4076 return p;
4077 }
4078
4079
4080 /* Jump over non-matching operations. */
4081 static unsigned char *
4082 skip_noops (p, pend)
4083 unsigned char *p, *pend;
4084 {
4085 int mcnt;
4086 while (p < pend)
4087 {
4088 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4089 {
4090 case start_memory:
4091 case stop_memory:
4092 p += 2; break;
4093 case no_op:
4094 p += 1; break;
4095 case jump:
4096 p += 1;
4097 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4098 p += mcnt;
4099 break;
4100 default:
4101 return p;
4102 }
4103 }
4104 assert (p == pend);
4105 return p;
4106 }
4107
4108 /* Non-zero if "p1 matches something" implies "p2 fails". */
4109 static int
4110 mutually_exclusive_p (bufp, p1, p2)
4111 struct re_pattern_buffer *bufp;
4112 unsigned char *p1, *p2;
4113 {
4114 re_opcode_t op2;
4115 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4116 unsigned char *pend = bufp->buffer + bufp->used;
4117
4118 assert (p1 >= bufp->buffer && p1 < pend
4119 && p2 >= bufp->buffer && p2 <= pend);
4120
4121 /* Skip over open/close-group commands.
4122 If what follows this loop is a ...+ construct,
4123 look at what begins its body, since we will have to
4124 match at least one of that. */
4125 p2 = skip_noops (p2, pend);
4126 /* The same skip can be done for p1, except that this function
4127 is only used in the case where p1 is a simple match operator. */
4128 /* p1 = skip_noops (p1, pend); */
4129
4130 assert (p1 >= bufp->buffer && p1 < pend
4131 && p2 >= bufp->buffer && p2 <= pend);
4132
4133 op2 = p2 == pend ? succeed : *p2;
4134
4135 switch (SWITCH_ENUM_CAST (op2))
4136 {
4137 case succeed:
4138 case endbuf:
4139 /* If we're at the end of the pattern, we can change. */
4140 if (skip_one_char (p1))
4141 {
4142 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4143 return 1;
4144 }
4145 break;
4146
4147 case endline:
4148 if (!bufp->newline_anchor)
4149 break;
4150 /* Fallthrough */
4151 case exactn:
4152 {
4153 register unsigned int c
4154 = (re_opcode_t) *p2 == endline ? '\n'
4155 : RE_STRING_CHAR(p2 + 2, pend - p2 - 2);
4156
4157 if ((re_opcode_t) *p1 == exactn)
4158 {
4159 if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2))
4160 {
4161 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4162 return 1;
4163 }
4164 }
4165
4166 else if ((re_opcode_t) *p1 == charset
4167 || (re_opcode_t) *p1 == charset_not)
4168 {
4169 int not = (re_opcode_t) *p1 == charset_not;
4170
4171 /* Test if C is listed in charset (or charset_not)
4172 at `p1'. */
4173 if (SINGLE_BYTE_CHAR_P (c))
4174 {
4175 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4176 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4177 not = !not;
4178 }
4179 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4180 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4181
4182 /* `not' is equal to 1 if c would match, which means
4183 that we can't change to pop_failure_jump. */
4184 if (!not)
4185 {
4186 DEBUG_PRINT1 (" No match => fast loop.\n");
4187 return 1;
4188 }
4189 }
4190 else if ((re_opcode_t) *p1 == anychar
4191 && c == '\n')
4192 {
4193 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4194 return 1;
4195 }
4196 }
4197 break;
4198
4199 case charset:
4200 case charset_not:
4201 {
4202 if ((re_opcode_t) *p1 == exactn)
4203 /* Reuse the code above. */
4204 return mutually_exclusive_p (bufp, p2, p1);
4205
4206
4207 /* It is hard to list up all the character in charset
4208 P2 if it includes multibyte character. Give up in
4209 such case. */
4210 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4211 {
4212 /* Now, we are sure that P2 has no range table.
4213 So, for the size of bitmap in P2, `p2[1]' is
4214 enough. But P1 may have range table, so the
4215 size of bitmap table of P1 is extracted by
4216 using macro `CHARSET_BITMAP_SIZE'.
4217
4218 Since we know that all the character listed in
4219 P2 is ASCII, it is enough to test only bitmap
4220 table of P1. */
4221
4222 if (*p1 == *p2)
4223 {
4224 int idx;
4225 /* We win if the charset inside the loop
4226 has no overlap with the one after the loop. */
4227 for (idx = 0;
4228 (idx < (int) p2[1]
4229 && idx < CHARSET_BITMAP_SIZE (p1));
4230 idx++)
4231 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4232 break;
4233
4234 if (idx == p2[1]
4235 || idx == CHARSET_BITMAP_SIZE (p1))
4236 {
4237 DEBUG_PRINT1 (" No match => fast loop.\n");
4238 return 1;
4239 }
4240 }
4241 else if ((re_opcode_t) *p1 == charset
4242 || (re_opcode_t) *p1 == charset_not)
4243 {
4244 int idx;
4245 /* We win if the charset_not inside the loop lists
4246 every character listed in the charset after. */
4247 for (idx = 0; idx < (int) p2[1]; idx++)
4248 if (! (p2[2 + idx] == 0
4249 || (idx < CHARSET_BITMAP_SIZE (p1)
4250 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4251 break;
4252
4253 if (idx == p2[1])
4254 {
4255 DEBUG_PRINT1 (" No match => fast loop.\n");
4256 return 1;
4257 }
4258 }
4259 }
4260 }
4261
4262 case wordend:
4263 case notsyntaxspec:
4264 return ((re_opcode_t) *p1 == syntaxspec
4265 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4266
4267 case wordbeg:
4268 case syntaxspec:
4269 return ((re_opcode_t) *p1 == notsyntaxspec
4270 && p1[1] == (op2 == wordend ? Sword : p2[1]));
4271
4272 case wordbound:
4273 return (((re_opcode_t) *p1 == notsyntaxspec
4274 || (re_opcode_t) *p1 == syntaxspec)
4275 && p1[1] == Sword);
4276
4277 #ifdef emacs
4278 case categoryspec:
4279 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4280 case notcategoryspec:
4281 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4282 #endif /* emacs */
4283
4284 default:
4285 ;
4286 }
4287
4288 /* Safe default. */
4289 return 0;
4290 }
4291
4292 \f
4293 /* Matching routines. */
4294
4295 #ifndef emacs /* Emacs never uses this. */
4296 /* re_match is like re_match_2 except it takes only a single string. */
4297
4298 int
4299 re_match (bufp, string, size, pos, regs)
4300 struct re_pattern_buffer *bufp;
4301 const char *string;
4302 int size, pos;
4303 struct re_registers *regs;
4304 {
4305 int result = re_match_2_internal (bufp, NULL, 0, string, size,
4306 pos, regs, size);
4307 #if defined (C_ALLOCA) && !defined (REGEX_MALLOC)
4308 alloca (0);
4309 #endif
4310 return result;
4311 }
4312 #endif /* not emacs */
4313
4314 #ifdef emacs
4315 /* In Emacs, this is the string or buffer in which we
4316 are matching. It is used for looking up syntax properties. */
4317 Lisp_Object re_match_object;
4318 #endif
4319
4320 /* re_match_2 matches the compiled pattern in BUFP against the
4321 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4322 and SIZE2, respectively). We start matching at POS, and stop
4323 matching at STOP.
4324
4325 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4326 store offsets for the substring each group matched in REGS. See the
4327 documentation for exactly how many groups we fill.
4328
4329 We return -1 if no match, -2 if an internal error (such as the
4330 failure stack overflowing). Otherwise, we return the length of the
4331 matched substring. */
4332
4333 int
4334 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
4335 struct re_pattern_buffer *bufp;
4336 const char *string1, *string2;
4337 int size1, size2;
4338 int pos;
4339 struct re_registers *regs;
4340 int stop;
4341 {
4342 int result;
4343
4344 #ifdef emacs
4345 int charpos;
4346 gl_state.object = re_match_object;
4347 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
4348 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4349 #endif
4350
4351 result = re_match_2_internal (bufp, string1, size1, string2, size2,
4352 pos, regs, stop);
4353 #if defined (C_ALLOCA) && !defined (REGEX_MALLOC)
4354 alloca (0);
4355 #endif
4356 return result;
4357 }
4358
4359 /* This is a separate function so that we can force an alloca cleanup
4360 afterwards. */
4361 static int
4362 re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
4363 struct re_pattern_buffer *bufp;
4364 re_char *string1, *string2;
4365 int size1, size2;
4366 int pos;
4367 struct re_registers *regs;
4368 int stop;
4369 {
4370 /* General temporaries. */
4371 int mcnt;
4372 boolean not;
4373 unsigned char *p1;
4374
4375 /* Just past the end of the corresponding string. */
4376 re_char *end1, *end2;
4377
4378 /* Pointers into string1 and string2, just past the last characters in
4379 each to consider matching. */
4380 re_char *end_match_1, *end_match_2;
4381
4382 /* Where we are in the data, and the end of the current string. */
4383 re_char *d, *dend;
4384
4385 /* Used sometimes to remember where we were before starting matching
4386 an operator so that we can go back in case of failure. This "atomic"
4387 behavior of matching opcodes is indispensable to the correctness
4388 of the on_failure_keep_string_jump optimization. */
4389 re_char *dfail;
4390
4391 /* Where we are in the pattern, and the end of the pattern. */
4392 unsigned char *p = bufp->buffer;
4393 register unsigned char *pend = p + bufp->used;
4394
4395 /* We use this to map every character in the string. */
4396 RE_TRANSLATE_TYPE translate = bufp->translate;
4397
4398 /* Nonzero if we have to concern multibyte character. */
4399 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4400
4401 /* Failure point stack. Each place that can handle a failure further
4402 down the line pushes a failure point on this stack. It consists of
4403 regstart, and regend for all registers corresponding to
4404 the subexpressions we're currently inside, plus the number of such
4405 registers, and, finally, two char *'s. The first char * is where
4406 to resume scanning the pattern; the second one is where to resume
4407 scanning the strings. */
4408 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4409 fail_stack_type fail_stack;
4410 #endif
4411 #ifdef DEBUG
4412 static unsigned failure_id = 0;
4413 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
4414 #endif
4415
4416 #if defined (REL_ALLOC) && defined (REGEX_MALLOC)
4417 /* This holds the pointer to the failure stack, when
4418 it is allocated relocatably. */
4419 fail_stack_elt_t *failure_stack_ptr;
4420 #endif
4421
4422 /* We fill all the registers internally, independent of what we
4423 return, for use in backreferences. The number here includes
4424 an element for register zero. */
4425 unsigned num_regs = bufp->re_nsub + 1;
4426
4427 /* Information on the contents of registers. These are pointers into
4428 the input strings; they record just what was matched (on this
4429 attempt) by a subexpression part of the pattern, that is, the
4430 regnum-th regstart pointer points to where in the pattern we began
4431 matching and the regnum-th regend points to right after where we
4432 stopped matching the regnum-th subexpression. (The zeroth register
4433 keeps track of what the whole pattern matches.) */
4434 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4435 re_char **regstart, **regend;
4436 #endif
4437
4438 /* The following record the register info as found in the above
4439 variables when we find a match better than any we've seen before.
4440 This happens as we backtrack through the failure points, which in
4441 turn happens only if we have not yet matched the entire string. */
4442 unsigned best_regs_set = false;
4443 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4444 re_char **best_regstart, **best_regend;
4445 #endif
4446
4447 /* Logically, this is `best_regend[0]'. But we don't want to have to
4448 allocate space for that if we're not allocating space for anything
4449 else (see below). Also, we never need info about register 0 for
4450 any of the other register vectors, and it seems rather a kludge to
4451 treat `best_regend' differently than the rest. So we keep track of
4452 the end of the best match so far in a separate variable. We
4453 initialize this to NULL so that when we backtrack the first time
4454 and need to test it, it's not garbage. */
4455 re_char *match_end = NULL;
4456
4457 #ifdef DEBUG
4458 /* Counts the total number of registers pushed. */
4459 unsigned num_regs_pushed = 0;
4460 #endif
4461
4462 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
4463
4464 INIT_FAIL_STACK ();
4465
4466 #ifdef MATCH_MAY_ALLOCATE
4467 /* Do not bother to initialize all the register variables if there are
4468 no groups in the pattern, as it takes a fair amount of time. If
4469 there are groups, we include space for register 0 (the whole
4470 pattern), even though we never use it, since it simplifies the
4471 array indexing. We should fix this. */
4472 if (bufp->re_nsub)
4473 {
4474 regstart = REGEX_TALLOC (num_regs, re_char *);
4475 regend = REGEX_TALLOC (num_regs, re_char *);
4476 best_regstart = REGEX_TALLOC (num_regs, re_char *);
4477 best_regend = REGEX_TALLOC (num_regs, re_char *);
4478
4479 if (!(regstart && regend && best_regstart && best_regend))
4480 {
4481 FREE_VARIABLES ();
4482 return -2;
4483 }
4484 }
4485 else
4486 {
4487 /* We must initialize all our variables to NULL, so that
4488 `FREE_VARIABLES' doesn't try to free them. */
4489 regstart = regend = best_regstart = best_regend = NULL;
4490 }
4491 #endif /* MATCH_MAY_ALLOCATE */
4492
4493 /* The starting position is bogus. */
4494 if (pos < 0 || pos > size1 + size2)
4495 {
4496 FREE_VARIABLES ();
4497 return -1;
4498 }
4499
4500 /* Initialize subexpression text positions to -1 to mark ones that no
4501 start_memory/stop_memory has been seen for. Also initialize the
4502 register information struct. */
4503 for (mcnt = 1; mcnt < num_regs; mcnt++)
4504 regstart[mcnt] = regend[mcnt] = REG_UNSET_VALUE;
4505
4506 /* We move `string1' into `string2' if the latter's empty -- but not if
4507 `string1' is null. */
4508 if (size2 == 0 && string1 != NULL)
4509 {
4510 string2 = string1;
4511 size2 = size1;
4512 string1 = 0;
4513 size1 = 0;
4514 }
4515 end1 = string1 + size1;
4516 end2 = string2 + size2;
4517
4518 /* `p' scans through the pattern as `d' scans through the data.
4519 `dend' is the end of the input string that `d' points within. `d'
4520 is advanced into the following input string whenever necessary, but
4521 this happens before fetching; therefore, at the beginning of the
4522 loop, `d' can be pointing at the end of a string, but it cannot
4523 equal `string2'. */
4524 if (pos >= size1)
4525 {
4526 /* Only match within string2. */
4527 d = string2 + pos - size1;
4528 dend = end_match_2 = string2 + stop - size1;
4529 end_match_1 = end1; /* Just to give it a value. */
4530 }
4531 else
4532 {
4533 if (stop < size1)
4534 {
4535 /* Only match within string1. */
4536 end_match_1 = string1 + stop;
4537 /* BEWARE!
4538 When we reach end_match_1, PREFETCH normally switches to string2.
4539 But in the present case, this means that just doing a PREFETCH
4540 makes us jump from `stop' to `gap' within the string.
4541 What we really want here is for the search to stop as
4542 soon as we hit end_match_1. That's why we set end_match_2
4543 to end_match_1 (since PREFETCH fails as soon as we hit
4544 end_match_2). */
4545 end_match_2 = end_match_1;
4546 }
4547 else
4548 { /* It's important to use this code when stop == size so that
4549 moving `d' from end1 to string2 will not prevent the d == dend
4550 check from catching the end of string. */
4551 end_match_1 = end1;
4552 end_match_2 = string2 + stop - size1;
4553 }
4554 d = string1 + pos;
4555 dend = end_match_1;
4556 }
4557
4558 DEBUG_PRINT1 ("The compiled pattern is: ");
4559 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
4560 DEBUG_PRINT1 ("The string to match is: `");
4561 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
4562 DEBUG_PRINT1 ("'\n");
4563
4564 /* This loops over pattern commands. It exits by returning from the
4565 function if the match is complete, or it drops through if the match
4566 fails at this starting point in the input data. */
4567 for (;;)
4568 {
4569 DEBUG_PRINT2 ("\n%p: ", p);
4570
4571 if (p == pend)
4572 { /* End of pattern means we might have succeeded. */
4573 DEBUG_PRINT1 ("end of pattern ... ");
4574
4575 /* If we haven't matched the entire string, and we want the
4576 longest match, try backtracking. */
4577 if (d != end_match_2)
4578 {
4579 /* 1 if this match ends in the same string (string1 or string2)
4580 as the best previous match. */
4581 boolean same_str_p = (FIRST_STRING_P (match_end)
4582 == FIRST_STRING_P (d));
4583 /* 1 if this match is the best seen so far. */
4584 boolean best_match_p;
4585
4586 /* AIX compiler got confused when this was combined
4587 with the previous declaration. */
4588 if (same_str_p)
4589 best_match_p = d > match_end;
4590 else
4591 best_match_p = !FIRST_STRING_P (d);
4592
4593 DEBUG_PRINT1 ("backtracking.\n");
4594
4595 if (!FAIL_STACK_EMPTY ())
4596 { /* More failure points to try. */
4597
4598 /* If exceeds best match so far, save it. */
4599 if (!best_regs_set || best_match_p)
4600 {
4601 best_regs_set = true;
4602 match_end = d;
4603
4604 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
4605
4606 for (mcnt = 1; mcnt < num_regs; mcnt++)
4607 {
4608 best_regstart[mcnt] = regstart[mcnt];
4609 best_regend[mcnt] = regend[mcnt];
4610 }
4611 }
4612 goto fail;
4613 }
4614
4615 /* If no failure points, don't restore garbage. And if
4616 last match is real best match, don't restore second
4617 best one. */
4618 else if (best_regs_set && !best_match_p)
4619 {
4620 restore_best_regs:
4621 /* Restore best match. It may happen that `dend ==
4622 end_match_1' while the restored d is in string2.
4623 For example, the pattern `x.*y.*z' against the
4624 strings `x-' and `y-z-', if the two strings are
4625 not consecutive in memory. */
4626 DEBUG_PRINT1 ("Restoring best registers.\n");
4627
4628 d = match_end;
4629 dend = ((d >= string1 && d <= end1)
4630 ? end_match_1 : end_match_2);
4631
4632 for (mcnt = 1; mcnt < num_regs; mcnt++)
4633 {
4634 regstart[mcnt] = best_regstart[mcnt];
4635 regend[mcnt] = best_regend[mcnt];
4636 }
4637 }
4638 } /* d != end_match_2 */
4639
4640 succeed_label:
4641 DEBUG_PRINT1 ("Accepting match.\n");
4642
4643 /* If caller wants register contents data back, do it. */
4644 if (regs && !bufp->no_sub)
4645 {
4646 /* Have the register data arrays been allocated? */
4647 if (bufp->regs_allocated == REGS_UNALLOCATED)
4648 { /* No. So allocate them with malloc. We need one
4649 extra element beyond `num_regs' for the `-1' marker
4650 GNU code uses. */
4651 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
4652 regs->start = TALLOC (regs->num_regs, regoff_t);
4653 regs->end = TALLOC (regs->num_regs, regoff_t);
4654 if (regs->start == NULL || regs->end == NULL)
4655 {
4656 FREE_VARIABLES ();
4657 return -2;
4658 }
4659 bufp->regs_allocated = REGS_REALLOCATE;
4660 }
4661 else if (bufp->regs_allocated == REGS_REALLOCATE)
4662 { /* Yes. If we need more elements than were already
4663 allocated, reallocate them. If we need fewer, just
4664 leave it alone. */
4665 if (regs->num_regs < num_regs + 1)
4666 {
4667 regs->num_regs = num_regs + 1;
4668 RETALLOC (regs->start, regs->num_regs, regoff_t);
4669 RETALLOC (regs->end, regs->num_regs, regoff_t);
4670 if (regs->start == NULL || regs->end == NULL)
4671 {
4672 FREE_VARIABLES ();
4673 return -2;
4674 }
4675 }
4676 }
4677 else
4678 {
4679 /* These braces fend off a "empty body in an else-statement"
4680 warning under GCC when assert expands to nothing. */
4681 assert (bufp->regs_allocated == REGS_FIXED);
4682 }
4683
4684 /* Convert the pointer data in `regstart' and `regend' to
4685 indices. Register zero has to be set differently,
4686 since we haven't kept track of any info for it. */
4687 if (regs->num_regs > 0)
4688 {
4689 regs->start[0] = pos;
4690 regs->end[0] = POINTER_TO_OFFSET (d);
4691 }
4692
4693 /* Go through the first `min (num_regs, regs->num_regs)'
4694 registers, since that is all we initialized. */
4695 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
4696 {
4697 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
4698 regs->start[mcnt] = regs->end[mcnt] = -1;
4699 else
4700 {
4701 regs->start[mcnt]
4702 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]);
4703 regs->end[mcnt]
4704 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]);
4705 }
4706 }
4707
4708 /* If the regs structure we return has more elements than
4709 were in the pattern, set the extra elements to -1. If
4710 we (re)allocated the registers, this is the case,
4711 because we always allocate enough to have at least one
4712 -1 at the end. */
4713 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
4714 regs->start[mcnt] = regs->end[mcnt] = -1;
4715 } /* regs && !bufp->no_sub */
4716
4717 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
4718 nfailure_points_pushed, nfailure_points_popped,
4719 nfailure_points_pushed - nfailure_points_popped);
4720 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
4721
4722 mcnt = POINTER_TO_OFFSET (d) - pos;
4723
4724 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
4725
4726 FREE_VARIABLES ();
4727 return mcnt;
4728 }
4729
4730 /* Otherwise match next pattern command. */
4731 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
4732 {
4733 /* Ignore these. Used to ignore the n of succeed_n's which
4734 currently have n == 0. */
4735 case no_op:
4736 DEBUG_PRINT1 ("EXECUTING no_op.\n");
4737 break;
4738
4739 case succeed:
4740 DEBUG_PRINT1 ("EXECUTING succeed.\n");
4741 goto succeed_label;
4742
4743 /* Match the next n pattern characters exactly. The following
4744 byte in the pattern defines n, and the n bytes after that
4745 are the characters to match. */
4746 case exactn:
4747 mcnt = *p++;
4748 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
4749
4750 /* Remember the start point to rollback upon failure. */
4751 dfail = d;
4752
4753 /* This is written out as an if-else so we don't waste time
4754 testing `translate' inside the loop. */
4755 if (RE_TRANSLATE_P (translate))
4756 {
4757 if (multibyte)
4758 do
4759 {
4760 int pat_charlen, buf_charlen;
4761 unsigned int pat_ch, buf_ch;
4762
4763 PREFETCH ();
4764 pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen);
4765 buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4766
4767 if (RE_TRANSLATE (translate, buf_ch)
4768 != pat_ch)
4769 {
4770 d = dfail;
4771 goto fail;
4772 }
4773
4774 p += pat_charlen;
4775 d += buf_charlen;
4776 mcnt -= pat_charlen;
4777 }
4778 while (mcnt > 0);
4779 else
4780 do
4781 {
4782 PREFETCH ();
4783 if (RE_TRANSLATE (translate, *d) != *p++)
4784 {
4785 d = dfail;
4786 goto fail;
4787 }
4788 d++;
4789 }
4790 while (--mcnt);
4791 }
4792 else
4793 {
4794 do
4795 {
4796 PREFETCH ();
4797 if (*d++ != *p++)
4798 {
4799 d = dfail;
4800 goto fail;
4801 }
4802 }
4803 while (--mcnt);
4804 }
4805 break;
4806
4807
4808 /* Match any character except possibly a newline or a null. */
4809 case anychar:
4810 {
4811 int buf_charlen;
4812 unsigned int buf_ch;
4813
4814 DEBUG_PRINT1 ("EXECUTING anychar.\n");
4815
4816 PREFETCH ();
4817 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen);
4818 buf_ch = TRANSLATE (buf_ch);
4819
4820 if ((!(bufp->syntax & RE_DOT_NEWLINE)
4821 && buf_ch == '\n')
4822 || ((bufp->syntax & RE_DOT_NOT_NULL)
4823 && buf_ch == '\000'))
4824 goto fail;
4825
4826 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
4827 d += buf_charlen;
4828 }
4829 break;
4830
4831
4832 case charset:
4833 case charset_not:
4834 {
4835 register unsigned int c;
4836 boolean not = (re_opcode_t) *(p - 1) == charset_not;
4837 int len;
4838
4839 /* Start of actual range_table, or end of bitmap if there is no
4840 range table. */
4841 unsigned char *range_table;
4842
4843 /* Nonzero if there is a range table. */
4844 int range_table_exists;
4845
4846 /* Number of ranges of range table. This is not included
4847 in the initial byte-length of the command. */
4848 int count = 0;
4849
4850 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
4851
4852 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
4853
4854 if (range_table_exists)
4855 {
4856 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
4857 EXTRACT_NUMBER_AND_INCR (count, range_table);
4858 }
4859
4860 PREFETCH ();
4861 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
4862 c = TRANSLATE (c); /* The character to match. */
4863
4864 if (SINGLE_BYTE_CHAR_P (c))
4865 { /* Lookup bitmap. */
4866 /* Cast to `unsigned' instead of `unsigned char' in
4867 case the bit list is a full 32 bytes long. */
4868 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
4869 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4870 not = !not;
4871 }
4872 #ifdef emacs
4873 else if (range_table_exists)
4874 {
4875 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
4876
4877 if ( (class_bits & BIT_ALNUM && ISALNUM (c))
4878 | (class_bits & BIT_ALPHA && ISALPHA (c))
4879 | (class_bits & BIT_ASCII && IS_REAL_ASCII (c))
4880 | (class_bits & BIT_GRAPH && ISGRAPH (c))
4881 | (class_bits & BIT_LOWER && ISLOWER (c))
4882 | (class_bits & BIT_MULTIBYTE && !ISUNIBYTE (c))
4883 | (class_bits & BIT_NONASCII && !IS_REAL_ASCII (c))
4884 | (class_bits & BIT_PRINT && ISPRINT (c))
4885 | (class_bits & BIT_PUNCT && ISPUNCT (c))
4886 | (class_bits & BIT_SPACE && ISSPACE (c))
4887 | (class_bits & BIT_UNIBYTE && ISUNIBYTE (c))
4888 | (class_bits & BIT_UPPER && ISUPPER (c))
4889 | (class_bits & BIT_WORD && ISWORD (c)))
4890 not = !not;
4891 else
4892 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
4893 }
4894 #endif /* emacs */
4895
4896 if (range_table_exists)
4897 p = CHARSET_RANGE_TABLE_END (range_table, count);
4898 else
4899 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
4900
4901 if (!not) goto fail;
4902
4903 d += len;
4904 break;
4905 }
4906
4907
4908 /* The beginning of a group is represented by start_memory.
4909 The argument is the register number. The text
4910 matched within the group is recorded (in the internal
4911 registers data structure) under the register number. */
4912 case start_memory:
4913 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
4914
4915 /* In case we need to undo this operation (via backtracking). */
4916 PUSH_FAILURE_REG ((unsigned int)*p);
4917
4918 regstart[*p] = d;
4919 regend[*p] = REG_UNSET_VALUE; /* probably unnecessary. -sm */
4920 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
4921
4922 /* Move past the register number and inner group count. */
4923 p += 1;
4924 break;
4925
4926
4927 /* The stop_memory opcode represents the end of a group. Its
4928 argument is the same as start_memory's: the register number. */
4929 case stop_memory:
4930 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
4931
4932 assert (!REG_UNSET (regstart[*p]));
4933 /* Strictly speaking, there should be code such as:
4934
4935 assert (REG_UNSET (regend[*p]));
4936 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
4937
4938 But the only info to be pushed is regend[*p] and it is known to
4939 be UNSET, so there really isn't anything to push.
4940 Not pushing anything, on the other hand deprives us from the
4941 guarantee that regend[*p] is UNSET since undoing this operation
4942 will not reset its value properly. This is not important since
4943 the value will only be read on the next start_memory or at
4944 the very end and both events can only happen if this stop_memory
4945 is *not* undone. */
4946
4947 regend[*p] = d;
4948 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
4949
4950 /* Move past the register number and the inner group count. */
4951 p += 1;
4952 break;
4953
4954
4955 /* \<digit> has been turned into a `duplicate' command which is
4956 followed by the numeric value of <digit> as the register number. */
4957 case duplicate:
4958 {
4959 register re_char *d2, *dend2;
4960 int regno = *p++; /* Get which register to match against. */
4961 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
4962
4963 /* Can't back reference a group which we've never matched. */
4964 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
4965 goto fail;
4966
4967 /* Where in input to try to start matching. */
4968 d2 = regstart[regno];
4969
4970 /* Remember the start point to rollback upon failure. */
4971 dfail = d;
4972
4973 /* Where to stop matching; if both the place to start and
4974 the place to stop matching are in the same string, then
4975 set to the place to stop, otherwise, for now have to use
4976 the end of the first string. */
4977
4978 dend2 = ((FIRST_STRING_P (regstart[regno])
4979 == FIRST_STRING_P (regend[regno]))
4980 ? regend[regno] : end_match_1);
4981 for (;;)
4982 {
4983 /* If necessary, advance to next segment in register
4984 contents. */
4985 while (d2 == dend2)
4986 {
4987 if (dend2 == end_match_2) break;
4988 if (dend2 == regend[regno]) break;
4989
4990 /* End of string1 => advance to string2. */
4991 d2 = string2;
4992 dend2 = regend[regno];
4993 }
4994 /* At end of register contents => success */
4995 if (d2 == dend2) break;
4996
4997 /* If necessary, advance to next segment in data. */
4998 PREFETCH ();
4999
5000 /* How many characters left in this segment to match. */
5001 mcnt = dend - d;
5002
5003 /* Want how many consecutive characters we can match in
5004 one shot, so, if necessary, adjust the count. */
5005 if (mcnt > dend2 - d2)
5006 mcnt = dend2 - d2;
5007
5008 /* Compare that many; failure if mismatch, else move
5009 past them. */
5010 if (RE_TRANSLATE_P (translate)
5011 ? bcmp_translate (d, d2, mcnt, translate, multibyte)
5012 : bcmp (d, d2, mcnt))
5013 {
5014 d = dfail;
5015 goto fail;
5016 }
5017 d += mcnt, d2 += mcnt;
5018 }
5019 }
5020 break;
5021
5022
5023 /* begline matches the empty string at the beginning of the string
5024 (unless `not_bol' is set in `bufp'), and, if
5025 `newline_anchor' is set, after newlines. */
5026 case begline:
5027 DEBUG_PRINT1 ("EXECUTING begline.\n");
5028
5029 if (AT_STRINGS_BEG (d))
5030 {
5031 if (!bufp->not_bol) break;
5032 }
5033 else
5034 {
5035 unsigned char c;
5036 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5037 if (c == '\n' && bufp->newline_anchor)
5038 break;
5039 }
5040 /* In all other cases, we fail. */
5041 goto fail;
5042
5043
5044 /* endline is the dual of begline. */
5045 case endline:
5046 DEBUG_PRINT1 ("EXECUTING endline.\n");
5047
5048 if (AT_STRINGS_END (d))
5049 {
5050 if (!bufp->not_eol) break;
5051 }
5052 else
5053 {
5054 PREFETCH_NOLIMIT ();
5055 if (*d == '\n' && bufp->newline_anchor)
5056 break;
5057 }
5058 goto fail;
5059
5060
5061 /* Match at the very beginning of the data. */
5062 case begbuf:
5063 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5064 if (AT_STRINGS_BEG (d))
5065 break;
5066 goto fail;
5067
5068
5069 /* Match at the very end of the data. */
5070 case endbuf:
5071 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5072 if (AT_STRINGS_END (d))
5073 break;
5074 goto fail;
5075
5076
5077 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5078 pushes NULL as the value for the string on the stack. Then
5079 `POP_FAILURE_POINT' will keep the current value for the
5080 string, instead of restoring it. To see why, consider
5081 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5082 then the . fails against the \n. But the next thing we want
5083 to do is match the \n against the \n; if we restored the
5084 string value, we would be back at the foo.
5085
5086 Because this is used only in specific cases, we don't need to
5087 check all the things that `on_failure_jump' does, to make
5088 sure the right things get saved on the stack. Hence we don't
5089 share its code. The only reason to push anything on the
5090 stack at all is that otherwise we would have to change
5091 `anychar's code to do something besides goto fail in this
5092 case; that seems worse than this. */
5093 case on_failure_keep_string_jump:
5094 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5095 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5096 mcnt, p + mcnt);
5097
5098 PUSH_FAILURE_POINT (p - 3, NULL);
5099 break;
5100
5101 /* A nasty loop is introduced by the non-greedy *? and +?.
5102 With such loops, the stack only ever contains one failure point
5103 at a time, so that a plain on_failure_jump_loop kind of
5104 cycle detection cannot work. Worse yet, such a detection
5105 can not only fail to detect a cycle, but it can also wrongly
5106 detect a cycle (between different instantiations of the same
5107 loop.
5108 So the method used for those nasty loops is a little different:
5109 We use a special cycle-detection-stack-frame which is pushed
5110 when the on_failure_jump_nastyloop failure-point is *popped*.
5111 This special frame thus marks the beginning of one iteration
5112 through the loop and we can hence easily check right here
5113 whether something matched between the beginning and the end of
5114 the loop. */
5115 case on_failure_jump_nastyloop:
5116 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5117 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5118 mcnt, p + mcnt);
5119
5120 assert ((re_opcode_t)p[-4] == no_op);
5121 CHECK_INFINITE_LOOP (p - 4, d);
5122 PUSH_FAILURE_POINT (p - 3, d);
5123 break;
5124
5125
5126 /* Simple loop detecting on_failure_jump: just check on the
5127 failure stack if the same spot was already hit earlier. */
5128 case on_failure_jump_loop:
5129 on_failure:
5130 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5131 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5132 mcnt, p + mcnt);
5133
5134 CHECK_INFINITE_LOOP (p - 3, d);
5135 PUSH_FAILURE_POINT (p - 3, d);
5136 break;
5137
5138
5139 /* Uses of on_failure_jump:
5140
5141 Each alternative starts with an on_failure_jump that points
5142 to the beginning of the next alternative. Each alternative
5143 except the last ends with a jump that in effect jumps past
5144 the rest of the alternatives. (They really jump to the
5145 ending jump of the following alternative, because tensioning
5146 these jumps is a hassle.)
5147
5148 Repeats start with an on_failure_jump that points past both
5149 the repetition text and either the following jump or
5150 pop_failure_jump back to this on_failure_jump. */
5151 case on_failure_jump:
5152 QUIT;
5153 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5154 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5155 mcnt, p + mcnt);
5156
5157 PUSH_FAILURE_POINT (p -3, d);
5158 break;
5159
5160 /* This operation is used for greedy *.
5161 Compare the beginning of the repeat with what in the
5162 pattern follows its end. If we can establish that there
5163 is nothing that they would both match, i.e., that we
5164 would have to backtrack because of (as in, e.g., `a*a')
5165 then we can use a non-backtracking loop based on
5166 on_failure_keep_string_jump instead of on_failure_jump. */
5167 case on_failure_jump_smart:
5168 QUIT;
5169 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5170 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5171 mcnt, p + mcnt);
5172 {
5173 unsigned char *p1 = p; /* Next operation. */
5174 unsigned char *p2 = p + mcnt; /* Destination of the jump. */
5175
5176 p -= 3; /* Reset so that we will re-execute the
5177 instruction once it's been changed. */
5178
5179 EXTRACT_NUMBER (mcnt, p2 - 2);
5180
5181 /* Ensure this is a indeed the trivial kind of loop
5182 we are expecting. */
5183 assert (skip_one_char (p1) == p2 - 3);
5184 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
5185 DEBUG_STATEMENT (debug += 2);
5186 if (mutually_exclusive_p (bufp, p1, p2))
5187 {
5188 /* Use a fast `on_failure_keep_string_jump' loop. */
5189 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5190 *p = (unsigned char) on_failure_keep_string_jump;
5191 STORE_NUMBER (p2 - 2, mcnt + 3);
5192 }
5193 else
5194 {
5195 /* Default to a safe `on_failure_jump' loop. */
5196 DEBUG_PRINT1 (" smart default => slow loop.\n");
5197 *p = (unsigned char) on_failure_jump;
5198 }
5199 DEBUG_STATEMENT (debug -= 2);
5200 }
5201 break;
5202
5203 /* Unconditionally jump (without popping any failure points). */
5204 case jump:
5205 unconditional_jump:
5206 QUIT;
5207 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5208 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5209 p += mcnt; /* Do the jump. */
5210 DEBUG_PRINT2 ("(to %p).\n", p);
5211 break;
5212
5213
5214 /* Have to succeed matching what follows at least n times.
5215 After that, handle like `on_failure_jump'. */
5216 case succeed_n:
5217 EXTRACT_NUMBER (mcnt, p + 2);
5218 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5219
5220 assert (mcnt >= 0);
5221 /* Originally, this is how many times we HAVE to succeed. */
5222 if (mcnt > 0)
5223 {
5224 mcnt--;
5225 p += 2;
5226 STORE_NUMBER_AND_INCR (p, mcnt);
5227 DEBUG_PRINT3 (" Setting %p to %d.\n", p, mcnt);
5228 }
5229 else if (mcnt == 0)
5230 {
5231 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2);
5232 p[2] = (unsigned char) no_op;
5233 p[3] = (unsigned char) no_op;
5234 goto on_failure;
5235 }
5236 break;
5237
5238 case jump_n:
5239 EXTRACT_NUMBER (mcnt, p + 2);
5240 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5241
5242 /* Originally, this is how many times we CAN jump. */
5243 if (mcnt)
5244 {
5245 mcnt--;
5246 STORE_NUMBER (p + 2, mcnt);
5247 goto unconditional_jump;
5248 }
5249 /* If don't have to jump any more, skip over the rest of command. */
5250 else
5251 p += 4;
5252 break;
5253
5254 case set_number_at:
5255 {
5256 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5257
5258 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5259 p1 = p + mcnt;
5260 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5261 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt);
5262 STORE_NUMBER (p1, mcnt);
5263 break;
5264 }
5265
5266 case wordbound:
5267 case notwordbound:
5268 not = (re_opcode_t) *(p - 1) == notwordbound;
5269 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
5270
5271 /* We SUCCEED (or FAIL) in one of the following cases: */
5272
5273 /* Case 1: D is at the beginning or the end of string. */
5274 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
5275 not = !not;
5276 else
5277 {
5278 /* C1 is the character before D, S1 is the syntax of C1, C2
5279 is the character at D, and S2 is the syntax of C2. */
5280 int c1, c2, s1, s2;
5281 #ifdef emacs
5282 int offset = PTR_TO_OFFSET (d - 1);
5283 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5284 UPDATE_SYNTAX_TABLE (charpos);
5285 #endif
5286 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5287 s1 = SYNTAX (c1);
5288 #ifdef emacs
5289 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
5290 #endif
5291 PREFETCH_NOLIMIT ();
5292 c2 = RE_STRING_CHAR (d, dend - d);
5293 s2 = SYNTAX (c2);
5294
5295 if (/* Case 2: Only one of S1 and S2 is Sword. */
5296 ((s1 == Sword) != (s2 == Sword))
5297 /* Case 3: Both of S1 and S2 are Sword, and macro
5298 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
5299 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
5300 not = !not;
5301 }
5302 if (not)
5303 break;
5304 else
5305 goto fail;
5306
5307 case wordbeg:
5308 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
5309
5310 /* We FAIL in one of the following cases: */
5311
5312 /* Case 1: D is at the end of string. */
5313 if (AT_STRINGS_END (d))
5314 goto fail;
5315 else
5316 {
5317 /* C1 is the character before D, S1 is the syntax of C1, C2
5318 is the character at D, and S2 is the syntax of C2. */
5319 int c1, c2, s1, s2;
5320 #ifdef emacs
5321 int offset = PTR_TO_OFFSET (d);
5322 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5323 UPDATE_SYNTAX_TABLE (charpos);
5324 #endif
5325 PREFETCH ();
5326 c2 = RE_STRING_CHAR (d, dend - d);
5327 s2 = SYNTAX (c2);
5328
5329 /* Case 2: S2 is not Sword. */
5330 if (s2 != Sword)
5331 goto fail;
5332
5333 /* Case 3: D is not at the beginning of string ... */
5334 if (!AT_STRINGS_BEG (d))
5335 {
5336 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5337 #ifdef emacs
5338 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
5339 #endif
5340 s1 = SYNTAX (c1);
5341
5342 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
5343 returns 0. */
5344 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5345 goto fail;
5346 }
5347 }
5348 break;
5349
5350 case wordend:
5351 DEBUG_PRINT1 ("EXECUTING wordend.\n");
5352
5353 /* We FAIL in one of the following cases: */
5354
5355 /* Case 1: D is at the beginning of string. */
5356 if (AT_STRINGS_BEG (d))
5357 goto fail;
5358 else
5359 {
5360 /* C1 is the character before D, S1 is the syntax of C1, C2
5361 is the character at D, and S2 is the syntax of C2. */
5362 int c1, c2, s1, s2;
5363 #ifdef emacs
5364 int offset = PTR_TO_OFFSET (d) - 1;
5365 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5366 UPDATE_SYNTAX_TABLE (charpos);
5367 #endif
5368 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
5369 s1 = SYNTAX (c1);
5370
5371 /* Case 2: S1 is not Sword. */
5372 if (s1 != Sword)
5373 goto fail;
5374
5375 /* Case 3: D is not at the end of string ... */
5376 if (!AT_STRINGS_END (d))
5377 {
5378 PREFETCH_NOLIMIT ();
5379 c2 = RE_STRING_CHAR (d, dend - d);
5380 #ifdef emacs
5381 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
5382 #endif
5383 s2 = SYNTAX (c2);
5384
5385 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
5386 returns 0. */
5387 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
5388 goto fail;
5389 }
5390 }
5391 break;
5392
5393 case syntaxspec:
5394 case notsyntaxspec:
5395 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
5396 mcnt = *p++;
5397 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
5398 PREFETCH ();
5399 #ifdef emacs
5400 {
5401 int offset = PTR_TO_OFFSET (d);
5402 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
5403 UPDATE_SYNTAX_TABLE (pos1);
5404 }
5405 #endif
5406 {
5407 int c, len;
5408
5409 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5410
5411 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
5412 goto fail;
5413 d += len;
5414 }
5415 break;
5416
5417 #ifdef emacs
5418 case before_dot:
5419 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
5420 if (PTR_BYTE_POS (d) >= PT_BYTE)
5421 goto fail;
5422 break;
5423
5424 case at_dot:
5425 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
5426 if (PTR_BYTE_POS (d) != PT_BYTE)
5427 goto fail;
5428 break;
5429
5430 case after_dot:
5431 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
5432 if (PTR_BYTE_POS (d) <= PT_BYTE)
5433 goto fail;
5434 break;
5435
5436 case categoryspec:
5437 case notcategoryspec:
5438 not = (re_opcode_t) *(p - 1) == notcategoryspec;
5439 mcnt = *p++;
5440 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
5441 PREFETCH ();
5442 {
5443 int c, len;
5444 c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len);
5445
5446 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
5447 goto fail;
5448 d += len;
5449 }
5450 break;
5451
5452 #endif /* emacs */
5453
5454 default:
5455 abort ();
5456 }
5457 continue; /* Successfully executed one pattern command; keep going. */
5458
5459
5460 /* We goto here if a matching operation fails. */
5461 fail:
5462 QUIT;
5463 if (!FAIL_STACK_EMPTY ())
5464 {
5465 re_char *str;
5466 unsigned char *pat;
5467 /* A restart point is known. Restore to that state. */
5468 DEBUG_PRINT1 ("\nFAIL:\n");
5469 POP_FAILURE_POINT (str, pat);
5470 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
5471 {
5472 case on_failure_keep_string_jump:
5473 assert (str == NULL);
5474 goto continue_failure_jump;
5475
5476 case on_failure_jump_nastyloop:
5477 assert ((re_opcode_t)pat[-2] == no_op);
5478 PUSH_FAILURE_POINT (pat - 2, str);
5479 /* Fallthrough */
5480
5481 case on_failure_jump_loop:
5482 case on_failure_jump:
5483 case succeed_n:
5484 d = str;
5485 continue_failure_jump:
5486 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
5487 p = pat + mcnt;
5488 break;
5489
5490 case no_op:
5491 /* A special frame used for nastyloops. */
5492 goto fail;
5493
5494 default:
5495 abort();
5496 }
5497
5498 assert (p >= bufp->buffer && p <= pend);
5499
5500 if (d >= string1 && d <= end1)
5501 dend = end_match_1;
5502 }
5503 else
5504 break; /* Matching at this starting point really fails. */
5505 } /* for (;;) */
5506
5507 if (best_regs_set)
5508 goto restore_best_regs;
5509
5510 FREE_VARIABLES ();
5511
5512 return -1; /* Failure to match. */
5513 } /* re_match_2 */
5514 \f
5515 /* Subroutine definitions for re_match_2. */
5516
5517 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
5518 bytes; nonzero otherwise. */
5519
5520 static int
5521 bcmp_translate (s1, s2, len, translate, multibyte)
5522 re_char *s1, *s2;
5523 register int len;
5524 RE_TRANSLATE_TYPE translate;
5525 const int multibyte;
5526 {
5527 register re_char *p1 = s1, *p2 = s2;
5528 re_char *p1_end = s1 + len;
5529 re_char *p2_end = s2 + len;
5530
5531 while (p1 != p1_end && p2 != p2_end)
5532 {
5533 int p1_charlen, p2_charlen;
5534 int p1_ch, p2_ch;
5535
5536 p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen);
5537 p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen);
5538
5539 if (RE_TRANSLATE (translate, p1_ch)
5540 != RE_TRANSLATE (translate, p2_ch))
5541 return 1;
5542
5543 p1 += p1_charlen, p2 += p2_charlen;
5544 }
5545
5546 if (p1 != p1_end || p2 != p2_end)
5547 return 1;
5548
5549 return 0;
5550 }
5551 \f
5552 /* Entry points for GNU code. */
5553
5554 /* re_compile_pattern is the GNU regular expression compiler: it
5555 compiles PATTERN (of length SIZE) and puts the result in BUFP.
5556 Returns 0 if the pattern was valid, otherwise an error string.
5557
5558 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
5559 are set in BUFP on entry.
5560
5561 We call regex_compile to do the actual compilation. */
5562
5563 const char *
5564 re_compile_pattern (pattern, length, bufp)
5565 const char *pattern;
5566 int length;
5567 struct re_pattern_buffer *bufp;
5568 {
5569 reg_errcode_t ret;
5570
5571 /* GNU code is written to assume at least RE_NREGS registers will be set
5572 (and at least one extra will be -1). */
5573 bufp->regs_allocated = REGS_UNALLOCATED;
5574
5575 /* And GNU code determines whether or not to get register information
5576 by passing null for the REGS argument to re_match, etc., not by
5577 setting no_sub. */
5578 bufp->no_sub = 0;
5579
5580 /* Match anchors at newline. */
5581 bufp->newline_anchor = 1;
5582
5583 ret = regex_compile (pattern, length, re_syntax_options, bufp);
5584
5585 if (!ret)
5586 return NULL;
5587 return gettext (re_error_msgid[(int) ret]);
5588 }
5589 \f
5590 /* Entry points compatible with 4.2 BSD regex library. We don't define
5591 them unless specifically requested. */
5592
5593 #if defined (_REGEX_RE_COMP) || defined (_LIBC)
5594
5595 /* BSD has one and only one pattern buffer. */
5596 static struct re_pattern_buffer re_comp_buf;
5597
5598 char *
5599 #ifdef _LIBC
5600 /* Make these definitions weak in libc, so POSIX programs can redefine
5601 these names if they don't use our functions, and still use
5602 regcomp/regexec below without link errors. */
5603 weak_function
5604 #endif
5605 re_comp (s)
5606 const char *s;
5607 {
5608 reg_errcode_t ret;
5609
5610 if (!s)
5611 {
5612 if (!re_comp_buf.buffer)
5613 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5614 return (char *) gettext ("No previous regular expression");
5615 return 0;
5616 }
5617
5618 if (!re_comp_buf.buffer)
5619 {
5620 re_comp_buf.buffer = (unsigned char *) malloc (200);
5621 if (re_comp_buf.buffer == NULL)
5622 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5623 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
5624 re_comp_buf.allocated = 200;
5625
5626 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
5627 if (re_comp_buf.fastmap == NULL)
5628 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5629 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
5630 }
5631
5632 /* Since `re_exec' always passes NULL for the `regs' argument, we
5633 don't need to initialize the pattern buffer fields which affect it. */
5634
5635 /* Match anchors at newlines. */
5636 re_comp_buf.newline_anchor = 1;
5637
5638 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
5639
5640 if (!ret)
5641 return NULL;
5642
5643 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
5644 return (char *) gettext (re_error_msgid[(int) ret]);
5645 }
5646
5647
5648 int
5649 #ifdef _LIBC
5650 weak_function
5651 #endif
5652 re_exec (s)
5653 const char *s;
5654 {
5655 const int len = strlen (s);
5656 return
5657 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
5658 }
5659 #endif /* _REGEX_RE_COMP */
5660 \f
5661 /* POSIX.2 functions. Don't define these for Emacs. */
5662
5663 #ifndef emacs
5664
5665 /* regcomp takes a regular expression as a string and compiles it.
5666
5667 PREG is a regex_t *. We do not expect any fields to be initialized,
5668 since POSIX says we shouldn't. Thus, we set
5669
5670 `buffer' to the compiled pattern;
5671 `used' to the length of the compiled pattern;
5672 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
5673 REG_EXTENDED bit in CFLAGS is set; otherwise, to
5674 RE_SYNTAX_POSIX_BASIC;
5675 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
5676 `fastmap' and `fastmap_accurate' to zero;
5677 `re_nsub' to the number of subexpressions in PATTERN.
5678
5679 PATTERN is the address of the pattern string.
5680
5681 CFLAGS is a series of bits which affect compilation.
5682
5683 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
5684 use POSIX basic syntax.
5685
5686 If REG_NEWLINE is set, then . and [^...] don't match newline.
5687 Also, regexec will try a match beginning after every newline.
5688
5689 If REG_ICASE is set, then we considers upper- and lowercase
5690 versions of letters to be equivalent when matching.
5691
5692 If REG_NOSUB is set, then when PREG is passed to regexec, that
5693 routine will report only success or failure, and nothing about the
5694 registers.
5695
5696 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
5697 the return codes and their meanings.) */
5698
5699 int
5700 regcomp (preg, pattern, cflags)
5701 regex_t *preg;
5702 const char *pattern;
5703 int cflags;
5704 {
5705 reg_errcode_t ret;
5706 unsigned syntax
5707 = (cflags & REG_EXTENDED) ?
5708 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
5709
5710 /* regex_compile will allocate the space for the compiled pattern. */
5711 preg->buffer = 0;
5712 preg->allocated = 0;
5713 preg->used = 0;
5714
5715 /* Don't bother to use a fastmap when searching. This simplifies the
5716 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
5717 characters after newlines into the fastmap. This way, we just try
5718 every character. */
5719 preg->fastmap = 0;
5720
5721 if (cflags & REG_ICASE)
5722 {
5723 unsigned i;
5724
5725 preg->translate
5726 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
5727 * sizeof (*(RE_TRANSLATE_TYPE)0));
5728 if (preg->translate == NULL)
5729 return (int) REG_ESPACE;
5730
5731 /* Map uppercase characters to corresponding lowercase ones. */
5732 for (i = 0; i < CHAR_SET_SIZE; i++)
5733 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
5734 }
5735 else
5736 preg->translate = NULL;
5737
5738 /* If REG_NEWLINE is set, newlines are treated differently. */
5739 if (cflags & REG_NEWLINE)
5740 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
5741 syntax &= ~RE_DOT_NEWLINE;
5742 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
5743 /* It also changes the matching behavior. */
5744 preg->newline_anchor = 1;
5745 }
5746 else
5747 preg->newline_anchor = 0;
5748
5749 preg->no_sub = !!(cflags & REG_NOSUB);
5750
5751 /* POSIX says a null character in the pattern terminates it, so we
5752 can use strlen here in compiling the pattern. */
5753 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
5754
5755 /* POSIX doesn't distinguish between an unmatched open-group and an
5756 unmatched close-group: both are REG_EPAREN. */
5757 if (ret == REG_ERPAREN) ret = REG_EPAREN;
5758
5759 return (int) ret;
5760 }
5761
5762
5763 /* regexec searches for a given pattern, specified by PREG, in the
5764 string STRING.
5765
5766 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
5767 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
5768 least NMATCH elements, and we set them to the offsets of the
5769 corresponding matched substrings.
5770
5771 EFLAGS specifies `execution flags' which affect matching: if
5772 REG_NOTBOL is set, then ^ does not match at the beginning of the
5773 string; if REG_NOTEOL is set, then $ does not match at the end.
5774
5775 We return 0 if we find a match and REG_NOMATCH if not. */
5776
5777 int
5778 regexec (preg, string, nmatch, pmatch, eflags)
5779 const regex_t *preg;
5780 const char *string;
5781 size_t nmatch;
5782 regmatch_t pmatch[];
5783 int eflags;
5784 {
5785 int ret;
5786 struct re_registers regs;
5787 regex_t private_preg;
5788 int len = strlen (string);
5789 boolean want_reg_info = !preg->no_sub && nmatch > 0;
5790
5791 private_preg = *preg;
5792
5793 private_preg.not_bol = !!(eflags & REG_NOTBOL);
5794 private_preg.not_eol = !!(eflags & REG_NOTEOL);
5795
5796 /* The user has told us exactly how many registers to return
5797 information about, via `nmatch'. We have to pass that on to the
5798 matching routines. */
5799 private_preg.regs_allocated = REGS_FIXED;
5800
5801 if (want_reg_info)
5802 {
5803 regs.num_regs = nmatch;
5804 regs.start = TALLOC (nmatch, regoff_t);
5805 regs.end = TALLOC (nmatch, regoff_t);
5806 if (regs.start == NULL || regs.end == NULL)
5807 return (int) REG_NOMATCH;
5808 }
5809
5810 /* Perform the searching operation. */
5811 ret = re_search (&private_preg, string, len,
5812 /* start: */ 0, /* range: */ len,
5813 want_reg_info ? &regs : (struct re_registers *) 0);
5814
5815 /* Copy the register information to the POSIX structure. */
5816 if (want_reg_info)
5817 {
5818 if (ret >= 0)
5819 {
5820 unsigned r;
5821
5822 for (r = 0; r < nmatch; r++)
5823 {
5824 pmatch[r].rm_so = regs.start[r];
5825 pmatch[r].rm_eo = regs.end[r];
5826 }
5827 }
5828
5829 /* If we needed the temporary register info, free the space now. */
5830 free (regs.start);
5831 free (regs.end);
5832 }
5833
5834 /* We want zero return to mean success, unlike `re_search'. */
5835 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
5836 }
5837
5838
5839 /* Returns a message corresponding to an error code, ERRCODE, returned
5840 from either regcomp or regexec. We don't use PREG here. */
5841
5842 size_t
5843 regerror (errcode, preg, errbuf, errbuf_size)
5844 int errcode;
5845 const regex_t *preg;
5846 char *errbuf;
5847 size_t errbuf_size;
5848 {
5849 const char *msg;
5850 size_t msg_size;
5851
5852 if (errcode < 0
5853 || errcode >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
5854 /* Only error codes returned by the rest of the code should be passed
5855 to this routine. If we are given anything else, or if other regex
5856 code generates an invalid error code, then the program has a bug.
5857 Dump core so we can fix it. */
5858 abort ();
5859
5860 msg = gettext (re_error_msgid[errcode]);
5861
5862 msg_size = strlen (msg) + 1; /* Includes the null. */
5863
5864 if (errbuf_size != 0)
5865 {
5866 if (msg_size > errbuf_size)
5867 {
5868 strncpy (errbuf, msg, errbuf_size - 1);
5869 errbuf[errbuf_size - 1] = 0;
5870 }
5871 else
5872 strcpy (errbuf, msg);
5873 }
5874
5875 return msg_size;
5876 }
5877
5878
5879 /* Free dynamically allocated space used by PREG. */
5880
5881 void
5882 regfree (preg)
5883 regex_t *preg;
5884 {
5885 if (preg->buffer != NULL)
5886 free (preg->buffer);
5887 preg->buffer = NULL;
5888
5889 preg->allocated = 0;
5890 preg->used = 0;
5891
5892 if (preg->fastmap != NULL)
5893 free (preg->fastmap);
5894 preg->fastmap = NULL;
5895 preg->fastmap_accurate = 0;
5896
5897 if (preg->translate != NULL)
5898 free (preg->translate);
5899 preg->translate = NULL;
5900 }
5901
5902 #endif /* not emacs */