]> code.delx.au - gnu-emacs/blob - src/regex.c
Convert consecutive FSF copyright years to ranges.
[gnu-emacs] / src / regex.c
1 /* Extended regular expression matching and search library, version
2 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the
3 internationalization features.)
4
5 Copyright (C) 1993-2011
6 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
21 USA. */
22
23 /* TODO:
24 - structure the opcode space into opcode+flag.
25 - merge with glibc's regex.[ch].
26 - replace (succeed_n + jump_n + set_number_at) with something that doesn't
27 need to modify the compiled regexp so that re_match can be reentrant.
28 - get rid of on_failure_jump_smart by doing the optimization in re_comp
29 rather than at run-time, so that re_match can be reentrant.
30 */
31
32 /* AIX requires this to be the first thing in the file. */
33 #if defined _AIX && !defined REGEX_MALLOC
34 #pragma alloca
35 #endif
36
37 #ifdef HAVE_CONFIG_H
38 # include <config.h>
39 #endif
40
41 #if defined STDC_HEADERS && !defined emacs
42 # include <stddef.h>
43 #else
44 /* We need this for `regex.h', and perhaps for the Emacs include files. */
45 # include <sys/types.h>
46 #endif
47
48 /* Whether to use ISO C Amendment 1 wide char functions.
49 Those should not be used for Emacs since it uses its own. */
50 #if defined _LIBC
51 #define WIDE_CHAR_SUPPORT 1
52 #else
53 #define WIDE_CHAR_SUPPORT \
54 (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC && !emacs)
55 #endif
56
57 /* For platform which support the ISO C amendement 1 functionality we
58 support user defined character classes. */
59 #if WIDE_CHAR_SUPPORT
60 /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
61 # include <wchar.h>
62 # include <wctype.h>
63 #endif
64
65 #ifdef _LIBC
66 /* We have to keep the namespace clean. */
67 # define regfree(preg) __regfree (preg)
68 # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef)
69 # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags)
70 # define regerror(err_code, preg, errbuf, errbuf_size) \
71 __regerror(err_code, preg, errbuf, errbuf_size)
72 # define re_set_registers(bu, re, nu, st, en) \
73 __re_set_registers (bu, re, nu, st, en)
74 # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \
75 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
76 # define re_match(bufp, string, size, pos, regs) \
77 __re_match (bufp, string, size, pos, regs)
78 # define re_search(bufp, string, size, startpos, range, regs) \
79 __re_search (bufp, string, size, startpos, range, regs)
80 # define re_compile_pattern(pattern, length, bufp) \
81 __re_compile_pattern (pattern, length, bufp)
82 # define re_set_syntax(syntax) __re_set_syntax (syntax)
83 # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \
84 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop)
85 # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp)
86
87 /* Make sure we call libc's function even if the user overrides them. */
88 # define btowc __btowc
89 # define iswctype __iswctype
90 # define wctype __wctype
91
92 # define WEAK_ALIAS(a,b) weak_alias (a, b)
93
94 /* We are also using some library internals. */
95 # include <locale/localeinfo.h>
96 # include <locale/elem-hash.h>
97 # include <langinfo.h>
98 #else
99 # define WEAK_ALIAS(a,b)
100 #endif
101
102 /* This is for other GNU distributions with internationalized messages. */
103 #if HAVE_LIBINTL_H || defined _LIBC
104 # include <libintl.h>
105 #else
106 # define gettext(msgid) (msgid)
107 #endif
108
109 #ifndef gettext_noop
110 /* This define is so xgettext can find the internationalizable
111 strings. */
112 # define gettext_noop(String) String
113 #endif
114
115 /* The `emacs' switch turns on certain matching commands
116 that make sense only in Emacs. */
117 #ifdef emacs
118
119 # include <setjmp.h>
120 # include "lisp.h"
121 # include "buffer.h"
122
123 /* Make syntax table lookup grant data in gl_state. */
124 # define SYNTAX_ENTRY_VIA_PROPERTY
125
126 # include "syntax.h"
127 # include "character.h"
128 # include "category.h"
129
130 # ifdef malloc
131 # undef malloc
132 # endif
133 # define malloc xmalloc
134 # ifdef realloc
135 # undef realloc
136 # endif
137 # define realloc xrealloc
138 # ifdef free
139 # undef free
140 # endif
141 # define free xfree
142
143 /* Converts the pointer to the char to BEG-based offset from the start. */
144 # define PTR_TO_OFFSET(d) POS_AS_IN_BUFFER (POINTER_TO_OFFSET (d))
145 # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object)))
146
147 # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
148 # define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
149 # define RE_STRING_CHAR(p, multibyte) \
150 (multibyte ? (STRING_CHAR (p)) : (*(p)))
151 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) \
152 (multibyte ? (STRING_CHAR_AND_LENGTH (p, len)) : ((len) = 1, *(p)))
153
154 # define RE_CHAR_TO_MULTIBYTE(c) UNIBYTE_TO_CHAR (c)
155
156 # define RE_CHAR_TO_UNIBYTE(c) CHAR_TO_BYTE_SAFE (c)
157
158 /* Set C a (possibly converted to multibyte) character before P. P
159 points into a string which is the virtual concatenation of STR1
160 (which ends at END1) or STR2 (which ends at END2). */
161 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
162 do { \
163 if (target_multibyte) \
164 { \
165 re_char *dtemp = (p) == (str2) ? (end1) : (p); \
166 re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \
167 while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \
168 c = STRING_CHAR (dtemp); \
169 } \
170 else \
171 { \
172 (c = ((p) == (str2) ? (end1) : (p))[-1]); \
173 (c) = RE_CHAR_TO_MULTIBYTE (c); \
174 } \
175 } while (0)
176
177 /* Set C a (possibly converted to multibyte) character at P, and set
178 LEN to the byte length of that character. */
179 # define GET_CHAR_AFTER(c, p, len) \
180 do { \
181 if (target_multibyte) \
182 (c) = STRING_CHAR_AND_LENGTH (p, len); \
183 else \
184 { \
185 (c) = *p; \
186 len = 1; \
187 (c) = RE_CHAR_TO_MULTIBYTE (c); \
188 } \
189 } while (0)
190
191 #else /* not emacs */
192
193 /* If we are not linking with Emacs proper,
194 we can't use the relocating allocator
195 even if config.h says that we can. */
196 # undef REL_ALLOC
197
198 # include <unistd.h>
199
200 /* When used in Emacs's lib-src, we need xmalloc and xrealloc. */
201
202 void *
203 xmalloc (size_t size)
204 {
205 register void *val;
206 val = (void *) malloc (size);
207 if (!val && size)
208 {
209 write (2, "virtual memory exhausted\n", 25);
210 exit (1);
211 }
212 return val;
213 }
214
215 void *
216 xrealloc (void *block, size_t size)
217 {
218 register void *val;
219 /* We must call malloc explicitly when BLOCK is 0, since some
220 reallocs don't do this. */
221 if (! block)
222 val = (void *) malloc (size);
223 else
224 val = (void *) realloc (block, size);
225 if (!val && size)
226 {
227 write (2, "virtual memory exhausted\n", 25);
228 exit (1);
229 }
230 return val;
231 }
232
233 # ifdef malloc
234 # undef malloc
235 # endif
236 # define malloc xmalloc
237 # ifdef realloc
238 # undef realloc
239 # endif
240 # define realloc xrealloc
241
242 /* This is the normal way of making sure we have memcpy, memcmp and memset. */
243 # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC
244 # include <string.h>
245 # else
246 # include <strings.h>
247 # ifndef memcmp
248 # define memcmp(s1, s2, n) bcmp (s1, s2, n)
249 # endif
250 # ifndef memcpy
251 # define memcpy(d, s, n) (bcopy (s, d, n), (d))
252 # endif
253 # endif
254
255 /* Define the syntax stuff for \<, \>, etc. */
256
257 /* Sword must be nonzero for the wordchar pattern commands in re_match_2. */
258 enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
259
260 # define SWITCH_ENUM_CAST(x) (x)
261
262 /* Dummy macros for non-Emacs environments. */
263 # define CHAR_CHARSET(c) 0
264 # define CHARSET_LEADING_CODE_BASE(c) 0
265 # define MAX_MULTIBYTE_LENGTH 1
266 # define RE_MULTIBYTE_P(x) 0
267 # define RE_TARGET_MULTIBYTE_P(x) 0
268 # define WORD_BOUNDARY_P(c1, c2) (0)
269 # define CHAR_HEAD_P(p) (1)
270 # define SINGLE_BYTE_CHAR_P(c) (1)
271 # define SAME_CHARSET_P(c1, c2) (1)
272 # define BYTES_BY_CHAR_HEAD(p) (1)
273 # define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
274 # define STRING_CHAR(p) (*(p))
275 # define RE_STRING_CHAR(p, multibyte) STRING_CHAR (p)
276 # define CHAR_STRING(c, s) (*(s) = (c), 1)
277 # define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
278 # define RE_STRING_CHAR_AND_LENGTH(p, len, multibyte) STRING_CHAR_AND_LENGTH (p, len)
279 # define RE_CHAR_TO_MULTIBYTE(c) (c)
280 # define RE_CHAR_TO_UNIBYTE(c) (c)
281 # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \
282 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1)))
283 # define GET_CHAR_AFTER(c, p, len) \
284 (c = *p, len = 1)
285 # define MAKE_CHAR(charset, c1, c2) (c1)
286 # define BYTE8_TO_CHAR(c) (c)
287 # define CHAR_BYTE8_P(c) (0)
288 # define CHAR_LEADING_CODE(c) (c)
289
290 #endif /* not emacs */
291
292 #ifndef RE_TRANSLATE
293 # define RE_TRANSLATE(TBL, C) ((unsigned char)(TBL)[C])
294 # define RE_TRANSLATE_P(TBL) (TBL)
295 #endif
296 \f
297 /* Get the interface, including the syntax bits. */
298 #include "regex.h"
299
300 /* isalpha etc. are used for the character classes. */
301 #include <ctype.h>
302
303 #ifdef emacs
304
305 /* 1 if C is an ASCII character. */
306 # define IS_REAL_ASCII(c) ((c) < 0200)
307
308 /* 1 if C is a unibyte character. */
309 # define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P ((c)))
310
311 /* The Emacs definitions should not be directly affected by locales. */
312
313 /* In Emacs, these are only used for single-byte characters. */
314 # define ISDIGIT(c) ((c) >= '0' && (c) <= '9')
315 # define ISCNTRL(c) ((c) < ' ')
316 # define ISXDIGIT(c) (((c) >= '0' && (c) <= '9') \
317 || ((c) >= 'a' && (c) <= 'f') \
318 || ((c) >= 'A' && (c) <= 'F'))
319
320 /* This is only used for single-byte characters. */
321 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
322
323 /* The rest must handle multibyte characters. */
324
325 # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
326 ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \
327 : 1)
328
329 # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
330 ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
331 : 1)
332
333 # define ISALNUM(c) (IS_REAL_ASCII (c) \
334 ? (((c) >= 'a' && (c) <= 'z') \
335 || ((c) >= 'A' && (c) <= 'Z') \
336 || ((c) >= '0' && (c) <= '9')) \
337 : SYNTAX (c) == Sword)
338
339 # define ISALPHA(c) (IS_REAL_ASCII (c) \
340 ? (((c) >= 'a' && (c) <= 'z') \
341 || ((c) >= 'A' && (c) <= 'Z')) \
342 : SYNTAX (c) == Sword)
343
344 # define ISLOWER(c) (LOWERCASEP (c))
345
346 # define ISPUNCT(c) (IS_REAL_ASCII (c) \
347 ? ((c) > ' ' && (c) < 0177 \
348 && !(((c) >= 'a' && (c) <= 'z') \
349 || ((c) >= 'A' && (c) <= 'Z') \
350 || ((c) >= '0' && (c) <= '9'))) \
351 : SYNTAX (c) != Sword)
352
353 # define ISSPACE(c) (SYNTAX (c) == Swhitespace)
354
355 # define ISUPPER(c) (UPPERCASEP (c))
356
357 # define ISWORD(c) (SYNTAX (c) == Sword)
358
359 #else /* not emacs */
360
361 /* Jim Meyering writes:
362
363 "... Some ctype macros are valid only for character codes that
364 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
365 using /bin/cc or gcc but without giving an ansi option). So, all
366 ctype uses should be through macros like ISPRINT... If
367 STDC_HEADERS is defined, then autoconf has verified that the ctype
368 macros don't need to be guarded with references to isascii. ...
369 Defining isascii to 1 should let any compiler worth its salt
370 eliminate the && through constant folding."
371 Solaris defines some of these symbols so we must undefine them first. */
372
373 # undef ISASCII
374 # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
375 # define ISASCII(c) 1
376 # else
377 # define ISASCII(c) isascii(c)
378 # endif
379
380 /* 1 if C is an ASCII character. */
381 # define IS_REAL_ASCII(c) ((c) < 0200)
382
383 /* This distinction is not meaningful, except in Emacs. */
384 # define ISUNIBYTE(c) 1
385
386 # ifdef isblank
387 # define ISBLANK(c) (ISASCII (c) && isblank (c))
388 # else
389 # define ISBLANK(c) ((c) == ' ' || (c) == '\t')
390 # endif
391 # ifdef isgraph
392 # define ISGRAPH(c) (ISASCII (c) && isgraph (c))
393 # else
394 # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
395 # endif
396
397 # undef ISPRINT
398 # define ISPRINT(c) (ISASCII (c) && isprint (c))
399 # define ISDIGIT(c) (ISASCII (c) && isdigit (c))
400 # define ISALNUM(c) (ISASCII (c) && isalnum (c))
401 # define ISALPHA(c) (ISASCII (c) && isalpha (c))
402 # define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
403 # define ISLOWER(c) (ISASCII (c) && islower (c))
404 # define ISPUNCT(c) (ISASCII (c) && ispunct (c))
405 # define ISSPACE(c) (ISASCII (c) && isspace (c))
406 # define ISUPPER(c) (ISASCII (c) && isupper (c))
407 # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
408
409 # define ISWORD(c) ISALPHA(c)
410
411 # ifdef _tolower
412 # define TOLOWER(c) _tolower(c)
413 # else
414 # define TOLOWER(c) tolower(c)
415 # endif
416
417 /* How many characters in the character set. */
418 # define CHAR_SET_SIZE 256
419
420 # ifdef SYNTAX_TABLE
421
422 extern char *re_syntax_table;
423
424 # else /* not SYNTAX_TABLE */
425
426 static char re_syntax_table[CHAR_SET_SIZE];
427
428 static void
429 init_syntax_once (void)
430 {
431 register int c;
432 static int done = 0;
433
434 if (done)
435 return;
436
437 memset (re_syntax_table, 0, sizeof re_syntax_table);
438
439 for (c = 0; c < CHAR_SET_SIZE; ++c)
440 if (ISALNUM (c))
441 re_syntax_table[c] = Sword;
442
443 re_syntax_table['_'] = Ssymbol;
444
445 done = 1;
446 }
447
448 # endif /* not SYNTAX_TABLE */
449
450 # define SYNTAX(c) re_syntax_table[(c)]
451
452 #endif /* not emacs */
453 \f
454 #ifndef NULL
455 # define NULL (void *)0
456 #endif
457
458 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
459 since ours (we hope) works properly with all combinations of
460 machines, compilers, `char' and `unsigned char' argument types.
461 (Per Bothner suggested the basic approach.) */
462 #undef SIGN_EXTEND_CHAR
463 #if __STDC__
464 # define SIGN_EXTEND_CHAR(c) ((signed char) (c))
465 #else /* not __STDC__ */
466 /* As in Harbison and Steele. */
467 # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
468 #endif
469 \f
470 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
471 use `alloca' instead of `malloc'. This is because using malloc in
472 re_search* or re_match* could cause memory leaks when C-g is used in
473 Emacs; also, malloc is slower and causes storage fragmentation. On
474 the other hand, malloc is more portable, and easier to debug.
475
476 Because we sometimes use alloca, some routines have to be macros,
477 not functions -- `alloca'-allocated space disappears at the end of the
478 function it is called in. */
479
480 #ifdef REGEX_MALLOC
481
482 # define REGEX_ALLOCATE malloc
483 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
484 # define REGEX_FREE free
485
486 #else /* not REGEX_MALLOC */
487
488 /* Emacs already defines alloca, sometimes. */
489 # ifndef alloca
490
491 /* Make alloca work the best possible way. */
492 # ifdef __GNUC__
493 # define alloca __builtin_alloca
494 # else /* not __GNUC__ */
495 # ifdef HAVE_ALLOCA_H
496 # include <alloca.h>
497 # endif /* HAVE_ALLOCA_H */
498 # endif /* not __GNUC__ */
499
500 # endif /* not alloca */
501
502 # define REGEX_ALLOCATE alloca
503
504 /* Assumes a `char *destination' variable. */
505 # define REGEX_REALLOCATE(source, osize, nsize) \
506 (destination = (char *) alloca (nsize), \
507 memcpy (destination, source, osize))
508
509 /* No need to do anything to free, after alloca. */
510 # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
511
512 #endif /* not REGEX_MALLOC */
513
514 /* Define how to allocate the failure stack. */
515
516 #if defined REL_ALLOC && defined REGEX_MALLOC
517
518 # define REGEX_ALLOCATE_STACK(size) \
519 r_alloc (&failure_stack_ptr, (size))
520 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
521 r_re_alloc (&failure_stack_ptr, (nsize))
522 # define REGEX_FREE_STACK(ptr) \
523 r_alloc_free (&failure_stack_ptr)
524
525 #else /* not using relocating allocator */
526
527 # ifdef REGEX_MALLOC
528
529 # define REGEX_ALLOCATE_STACK malloc
530 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
531 # define REGEX_FREE_STACK free
532
533 # else /* not REGEX_MALLOC */
534
535 # define REGEX_ALLOCATE_STACK alloca
536
537 # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
538 REGEX_REALLOCATE (source, osize, nsize)
539 /* No need to explicitly free anything. */
540 # define REGEX_FREE_STACK(arg) ((void)0)
541
542 # endif /* not REGEX_MALLOC */
543 #endif /* not using relocating allocator */
544
545
546 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
547 `string1' or just past its end. This works if PTR is NULL, which is
548 a good thing. */
549 #define FIRST_STRING_P(ptr) \
550 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
551
552 /* (Re)Allocate N items of type T using malloc, or fail. */
553 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
554 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
555 #define RETALLOC_IF(addr, n, t) \
556 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
557 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
558
559 #define BYTEWIDTH 8 /* In bits. */
560
561 #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
562
563 #undef MAX
564 #undef MIN
565 #define MAX(a, b) ((a) > (b) ? (a) : (b))
566 #define MIN(a, b) ((a) < (b) ? (a) : (b))
567
568 /* Type of source-pattern and string chars. */
569 typedef const unsigned char re_char;
570
571 typedef char boolean;
572 #define false 0
573 #define true 1
574
575 static int re_match_2_internal _RE_ARGS ((struct re_pattern_buffer *bufp,
576 re_char *string1, int size1,
577 re_char *string2, int size2,
578 int pos,
579 struct re_registers *regs,
580 int stop));
581 \f
582 /* These are the command codes that appear in compiled regular
583 expressions. Some opcodes are followed by argument bytes. A
584 command code can specify any interpretation whatsoever for its
585 arguments. Zero bytes may appear in the compiled regular expression. */
586
587 typedef enum
588 {
589 no_op = 0,
590
591 /* Succeed right away--no more backtracking. */
592 succeed,
593
594 /* Followed by one byte giving n, then by n literal bytes. */
595 exactn,
596
597 /* Matches any (more or less) character. */
598 anychar,
599
600 /* Matches any one char belonging to specified set. First
601 following byte is number of bitmap bytes. Then come bytes
602 for a bitmap saying which chars are in. Bits in each byte
603 are ordered low-bit-first. A character is in the set if its
604 bit is 1. A character too large to have a bit in the map is
605 automatically not in the set.
606
607 If the length byte has the 0x80 bit set, then that stuff
608 is followed by a range table:
609 2 bytes of flags for character sets (low 8 bits, high 8 bits)
610 See RANGE_TABLE_WORK_BITS below.
611 2 bytes, the number of pairs that follow (upto 32767)
612 pairs, each 2 multibyte characters,
613 each multibyte character represented as 3 bytes. */
614 charset,
615
616 /* Same parameters as charset, but match any character that is
617 not one of those specified. */
618 charset_not,
619
620 /* Start remembering the text that is matched, for storing in a
621 register. Followed by one byte with the register number, in
622 the range 0 to one less than the pattern buffer's re_nsub
623 field. */
624 start_memory,
625
626 /* Stop remembering the text that is matched and store it in a
627 memory register. Followed by one byte with the register
628 number, in the range 0 to one less than `re_nsub' in the
629 pattern buffer. */
630 stop_memory,
631
632 /* Match a duplicate of something remembered. Followed by one
633 byte containing the register number. */
634 duplicate,
635
636 /* Fail unless at beginning of line. */
637 begline,
638
639 /* Fail unless at end of line. */
640 endline,
641
642 /* Succeeds if at beginning of buffer (if emacs) or at beginning
643 of string to be matched (if not). */
644 begbuf,
645
646 /* Analogously, for end of buffer/string. */
647 endbuf,
648
649 /* Followed by two byte relative address to which to jump. */
650 jump,
651
652 /* Followed by two-byte relative address of place to resume at
653 in case of failure. */
654 on_failure_jump,
655
656 /* Like on_failure_jump, but pushes a placeholder instead of the
657 current string position when executed. */
658 on_failure_keep_string_jump,
659
660 /* Just like `on_failure_jump', except that it checks that we
661 don't get stuck in an infinite loop (matching an empty string
662 indefinitely). */
663 on_failure_jump_loop,
664
665 /* Just like `on_failure_jump_loop', except that it checks for
666 a different kind of loop (the kind that shows up with non-greedy
667 operators). This operation has to be immediately preceded
668 by a `no_op'. */
669 on_failure_jump_nastyloop,
670
671 /* A smart `on_failure_jump' used for greedy * and + operators.
672 It analyses the loop before which it is put and if the
673 loop does not require backtracking, it changes itself to
674 `on_failure_keep_string_jump' and short-circuits the loop,
675 else it just defaults to changing itself into `on_failure_jump'.
676 It assumes that it is pointing to just past a `jump'. */
677 on_failure_jump_smart,
678
679 /* Followed by two-byte relative address and two-byte number n.
680 After matching N times, jump to the address upon failure.
681 Does not work if N starts at 0: use on_failure_jump_loop
682 instead. */
683 succeed_n,
684
685 /* Followed by two-byte relative address, and two-byte number n.
686 Jump to the address N times, then fail. */
687 jump_n,
688
689 /* Set the following two-byte relative address to the
690 subsequent two-byte number. The address *includes* the two
691 bytes of number. */
692 set_number_at,
693
694 wordbeg, /* Succeeds if at word beginning. */
695 wordend, /* Succeeds if at word end. */
696
697 wordbound, /* Succeeds if at a word boundary. */
698 notwordbound, /* Succeeds if not at a word boundary. */
699
700 symbeg, /* Succeeds if at symbol beginning. */
701 symend, /* Succeeds if at symbol end. */
702
703 /* Matches any character whose syntax is specified. Followed by
704 a byte which contains a syntax code, e.g., Sword. */
705 syntaxspec,
706
707 /* Matches any character whose syntax is not that specified. */
708 notsyntaxspec
709
710 #ifdef emacs
711 ,before_dot, /* Succeeds if before point. */
712 at_dot, /* Succeeds if at point. */
713 after_dot, /* Succeeds if after point. */
714
715 /* Matches any character whose category-set contains the specified
716 category. The operator is followed by a byte which contains a
717 category code (mnemonic ASCII character). */
718 categoryspec,
719
720 /* Matches any character whose category-set does not contain the
721 specified category. The operator is followed by a byte which
722 contains the category code (mnemonic ASCII character). */
723 notcategoryspec
724 #endif /* emacs */
725 } re_opcode_t;
726 \f
727 /* Common operations on the compiled pattern. */
728
729 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
730
731 #define STORE_NUMBER(destination, number) \
732 do { \
733 (destination)[0] = (number) & 0377; \
734 (destination)[1] = (number) >> 8; \
735 } while (0)
736
737 /* Same as STORE_NUMBER, except increment DESTINATION to
738 the byte after where the number is stored. Therefore, DESTINATION
739 must be an lvalue. */
740
741 #define STORE_NUMBER_AND_INCR(destination, number) \
742 do { \
743 STORE_NUMBER (destination, number); \
744 (destination) += 2; \
745 } while (0)
746
747 /* Put into DESTINATION a number stored in two contiguous bytes starting
748 at SOURCE. */
749
750 #define EXTRACT_NUMBER(destination, source) \
751 do { \
752 (destination) = *(source) & 0377; \
753 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
754 } while (0)
755
756 #ifdef DEBUG
757 static void extract_number _RE_ARGS ((int *dest, re_char *source));
758 static void
759 extract_number (dest, source)
760 int *dest;
761 re_char *source;
762 {
763 int temp = SIGN_EXTEND_CHAR (*(source + 1));
764 *dest = *source & 0377;
765 *dest += temp << 8;
766 }
767
768 # ifndef EXTRACT_MACROS /* To debug the macros. */
769 # undef EXTRACT_NUMBER
770 # define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
771 # endif /* not EXTRACT_MACROS */
772
773 #endif /* DEBUG */
774
775 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
776 SOURCE must be an lvalue. */
777
778 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
779 do { \
780 EXTRACT_NUMBER (destination, source); \
781 (source) += 2; \
782 } while (0)
783
784 #ifdef DEBUG
785 static void extract_number_and_incr _RE_ARGS ((int *destination,
786 re_char **source));
787 static void
788 extract_number_and_incr (destination, source)
789 int *destination;
790 re_char **source;
791 {
792 extract_number (destination, *source);
793 *source += 2;
794 }
795
796 # ifndef EXTRACT_MACROS
797 # undef EXTRACT_NUMBER_AND_INCR
798 # define EXTRACT_NUMBER_AND_INCR(dest, src) \
799 extract_number_and_incr (&dest, &src)
800 # endif /* not EXTRACT_MACROS */
801
802 #endif /* DEBUG */
803 \f
804 /* Store a multibyte character in three contiguous bytes starting
805 DESTINATION, and increment DESTINATION to the byte after where the
806 character is stored. Therefore, DESTINATION must be an lvalue. */
807
808 #define STORE_CHARACTER_AND_INCR(destination, character) \
809 do { \
810 (destination)[0] = (character) & 0377; \
811 (destination)[1] = ((character) >> 8) & 0377; \
812 (destination)[2] = (character) >> 16; \
813 (destination) += 3; \
814 } while (0)
815
816 /* Put into DESTINATION a character stored in three contiguous bytes
817 starting at SOURCE. */
818
819 #define EXTRACT_CHARACTER(destination, source) \
820 do { \
821 (destination) = ((source)[0] \
822 | ((source)[1] << 8) \
823 | ((source)[2] << 16)); \
824 } while (0)
825
826
827 /* Macros for charset. */
828
829 /* Size of bitmap of charset P in bytes. P is a start of charset,
830 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */
831 #define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F)
832
833 /* Nonzero if charset P has range table. */
834 #define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80)
835
836 /* Return the address of range table of charset P. But not the start
837 of table itself, but the before where the number of ranges is
838 stored. `2 +' means to skip re_opcode_t and size of bitmap,
839 and the 2 bytes of flags at the start of the range table. */
840 #define CHARSET_RANGE_TABLE(p) (&(p)[4 + CHARSET_BITMAP_SIZE (p)])
841
842 /* Extract the bit flags that start a range table. */
843 #define CHARSET_RANGE_TABLE_BITS(p) \
844 ((p)[2 + CHARSET_BITMAP_SIZE (p)] \
845 + (p)[3 + CHARSET_BITMAP_SIZE (p)] * 0x100)
846
847 /* Test if C is listed in the bitmap of charset P. */
848 #define CHARSET_LOOKUP_BITMAP(p, c) \
849 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \
850 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH)))
851
852 /* Return the address of end of RANGE_TABLE. COUNT is number of
853 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2'
854 is start of range and end of range. `* 3' is size of each start
855 and end. */
856 #define CHARSET_RANGE_TABLE_END(range_table, count) \
857 ((range_table) + (count) * 2 * 3)
858
859 /* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in.
860 COUNT is number of ranges in RANGE_TABLE. */
861 #define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \
862 do \
863 { \
864 re_wchar_t range_start, range_end; \
865 re_char *p; \
866 re_char *range_table_end \
867 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \
868 \
869 for (p = (range_table); p < range_table_end; p += 2 * 3) \
870 { \
871 EXTRACT_CHARACTER (range_start, p); \
872 EXTRACT_CHARACTER (range_end, p + 3); \
873 \
874 if (range_start <= (c) && (c) <= range_end) \
875 { \
876 (not) = !(not); \
877 break; \
878 } \
879 } \
880 } \
881 while (0)
882
883 /* Test if C is in range table of CHARSET. The flag NOT is negated if
884 C is listed in it. */
885 #define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \
886 do \
887 { \
888 /* Number of ranges in range table. */ \
889 int count; \
890 re_char *range_table = CHARSET_RANGE_TABLE (charset); \
891 \
892 EXTRACT_NUMBER_AND_INCR (count, range_table); \
893 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \
894 } \
895 while (0)
896 \f
897 /* If DEBUG is defined, Regex prints many voluminous messages about what
898 it is doing (if the variable `debug' is nonzero). If linked with the
899 main program in `iregex.c', you can enter patterns and strings
900 interactively. And if linked with the main program in `main.c' and
901 the other test files, you can run the already-written tests. */
902
903 #ifdef DEBUG
904
905 /* We use standard I/O for debugging. */
906 # include <stdio.h>
907
908 /* It is useful to test things that ``must'' be true when debugging. */
909 # include <assert.h>
910
911 static int debug = -100000;
912
913 # define DEBUG_STATEMENT(e) e
914 # define DEBUG_PRINT1(x) if (debug > 0) printf (x)
915 # define DEBUG_PRINT2(x1, x2) if (debug > 0) printf (x1, x2)
916 # define DEBUG_PRINT3(x1, x2, x3) if (debug > 0) printf (x1, x2, x3)
917 # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug > 0) printf (x1, x2, x3, x4)
918 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
919 if (debug > 0) print_partial_compiled_pattern (s, e)
920 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
921 if (debug > 0) print_double_string (w, s1, sz1, s2, sz2)
922
923
924 /* Print the fastmap in human-readable form. */
925
926 void
927 print_fastmap (fastmap)
928 char *fastmap;
929 {
930 unsigned was_a_range = 0;
931 unsigned i = 0;
932
933 while (i < (1 << BYTEWIDTH))
934 {
935 if (fastmap[i++])
936 {
937 was_a_range = 0;
938 putchar (i - 1);
939 while (i < (1 << BYTEWIDTH) && fastmap[i])
940 {
941 was_a_range = 1;
942 i++;
943 }
944 if (was_a_range)
945 {
946 printf ("-");
947 putchar (i - 1);
948 }
949 }
950 }
951 putchar ('\n');
952 }
953
954
955 /* Print a compiled pattern string in human-readable form, starting at
956 the START pointer into it and ending just before the pointer END. */
957
958 void
959 print_partial_compiled_pattern (start, end)
960 re_char *start;
961 re_char *end;
962 {
963 int mcnt, mcnt2;
964 re_char *p = start;
965 re_char *pend = end;
966
967 if (start == NULL)
968 {
969 fprintf (stderr, "(null)\n");
970 return;
971 }
972
973 /* Loop over pattern commands. */
974 while (p < pend)
975 {
976 fprintf (stderr, "%d:\t", p - start);
977
978 switch ((re_opcode_t) *p++)
979 {
980 case no_op:
981 fprintf (stderr, "/no_op");
982 break;
983
984 case succeed:
985 fprintf (stderr, "/succeed");
986 break;
987
988 case exactn:
989 mcnt = *p++;
990 fprintf (stderr, "/exactn/%d", mcnt);
991 do
992 {
993 fprintf (stderr, "/%c", *p++);
994 }
995 while (--mcnt);
996 break;
997
998 case start_memory:
999 fprintf (stderr, "/start_memory/%d", *p++);
1000 break;
1001
1002 case stop_memory:
1003 fprintf (stderr, "/stop_memory/%d", *p++);
1004 break;
1005
1006 case duplicate:
1007 fprintf (stderr, "/duplicate/%d", *p++);
1008 break;
1009
1010 case anychar:
1011 fprintf (stderr, "/anychar");
1012 break;
1013
1014 case charset:
1015 case charset_not:
1016 {
1017 register int c, last = -100;
1018 register int in_range = 0;
1019 int length = CHARSET_BITMAP_SIZE (p - 1);
1020 int has_range_table = CHARSET_RANGE_TABLE_EXISTS_P (p - 1);
1021
1022 fprintf (stderr, "/charset [%s",
1023 (re_opcode_t) *(p - 1) == charset_not ? "^" : "");
1024
1025 if (p + *p >= pend)
1026 fprintf (stderr, " !extends past end of pattern! ");
1027
1028 for (c = 0; c < 256; c++)
1029 if (c / 8 < length
1030 && (p[1 + (c/8)] & (1 << (c % 8))))
1031 {
1032 /* Are we starting a range? */
1033 if (last + 1 == c && ! in_range)
1034 {
1035 fprintf (stderr, "-");
1036 in_range = 1;
1037 }
1038 /* Have we broken a range? */
1039 else if (last + 1 != c && in_range)
1040 {
1041 fprintf (stderr, "%c", last);
1042 in_range = 0;
1043 }
1044
1045 if (! in_range)
1046 fprintf (stderr, "%c", c);
1047
1048 last = c;
1049 }
1050
1051 if (in_range)
1052 fprintf (stderr, "%c", last);
1053
1054 fprintf (stderr, "]");
1055
1056 p += 1 + length;
1057
1058 if (has_range_table)
1059 {
1060 int count;
1061 fprintf (stderr, "has-range-table");
1062
1063 /* ??? Should print the range table; for now, just skip it. */
1064 p += 2; /* skip range table bits */
1065 EXTRACT_NUMBER_AND_INCR (count, p);
1066 p = CHARSET_RANGE_TABLE_END (p, count);
1067 }
1068 }
1069 break;
1070
1071 case begline:
1072 fprintf (stderr, "/begline");
1073 break;
1074
1075 case endline:
1076 fprintf (stderr, "/endline");
1077 break;
1078
1079 case on_failure_jump:
1080 extract_number_and_incr (&mcnt, &p);
1081 fprintf (stderr, "/on_failure_jump to %d", p + mcnt - start);
1082 break;
1083
1084 case on_failure_keep_string_jump:
1085 extract_number_and_incr (&mcnt, &p);
1086 fprintf (stderr, "/on_failure_keep_string_jump to %d", p + mcnt - start);
1087 break;
1088
1089 case on_failure_jump_nastyloop:
1090 extract_number_and_incr (&mcnt, &p);
1091 fprintf (stderr, "/on_failure_jump_nastyloop to %d", p + mcnt - start);
1092 break;
1093
1094 case on_failure_jump_loop:
1095 extract_number_and_incr (&mcnt, &p);
1096 fprintf (stderr, "/on_failure_jump_loop to %d", p + mcnt - start);
1097 break;
1098
1099 case on_failure_jump_smart:
1100 extract_number_and_incr (&mcnt, &p);
1101 fprintf (stderr, "/on_failure_jump_smart to %d", p + mcnt - start);
1102 break;
1103
1104 case jump:
1105 extract_number_and_incr (&mcnt, &p);
1106 fprintf (stderr, "/jump to %d", p + mcnt - start);
1107 break;
1108
1109 case succeed_n:
1110 extract_number_and_incr (&mcnt, &p);
1111 extract_number_and_incr (&mcnt2, &p);
1112 fprintf (stderr, "/succeed_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
1113 break;
1114
1115 case jump_n:
1116 extract_number_and_incr (&mcnt, &p);
1117 extract_number_and_incr (&mcnt2, &p);
1118 fprintf (stderr, "/jump_n to %d, %d times", p - 2 + mcnt - start, mcnt2);
1119 break;
1120
1121 case set_number_at:
1122 extract_number_and_incr (&mcnt, &p);
1123 extract_number_and_incr (&mcnt2, &p);
1124 fprintf (stderr, "/set_number_at location %d to %d", p - 2 + mcnt - start, mcnt2);
1125 break;
1126
1127 case wordbound:
1128 fprintf (stderr, "/wordbound");
1129 break;
1130
1131 case notwordbound:
1132 fprintf (stderr, "/notwordbound");
1133 break;
1134
1135 case wordbeg:
1136 fprintf (stderr, "/wordbeg");
1137 break;
1138
1139 case wordend:
1140 fprintf (stderr, "/wordend");
1141 break;
1142
1143 case symbeg:
1144 fprintf (stderr, "/symbeg");
1145 break;
1146
1147 case symend:
1148 fprintf (stderr, "/symend");
1149 break;
1150
1151 case syntaxspec:
1152 fprintf (stderr, "/syntaxspec");
1153 mcnt = *p++;
1154 fprintf (stderr, "/%d", mcnt);
1155 break;
1156
1157 case notsyntaxspec:
1158 fprintf (stderr, "/notsyntaxspec");
1159 mcnt = *p++;
1160 fprintf (stderr, "/%d", mcnt);
1161 break;
1162
1163 # ifdef emacs
1164 case before_dot:
1165 fprintf (stderr, "/before_dot");
1166 break;
1167
1168 case at_dot:
1169 fprintf (stderr, "/at_dot");
1170 break;
1171
1172 case after_dot:
1173 fprintf (stderr, "/after_dot");
1174 break;
1175
1176 case categoryspec:
1177 fprintf (stderr, "/categoryspec");
1178 mcnt = *p++;
1179 fprintf (stderr, "/%d", mcnt);
1180 break;
1181
1182 case notcategoryspec:
1183 fprintf (stderr, "/notcategoryspec");
1184 mcnt = *p++;
1185 fprintf (stderr, "/%d", mcnt);
1186 break;
1187 # endif /* emacs */
1188
1189 case begbuf:
1190 fprintf (stderr, "/begbuf");
1191 break;
1192
1193 case endbuf:
1194 fprintf (stderr, "/endbuf");
1195 break;
1196
1197 default:
1198 fprintf (stderr, "?%d", *(p-1));
1199 }
1200
1201 fprintf (stderr, "\n");
1202 }
1203
1204 fprintf (stderr, "%d:\tend of pattern.\n", p - start);
1205 }
1206
1207
1208 void
1209 print_compiled_pattern (bufp)
1210 struct re_pattern_buffer *bufp;
1211 {
1212 re_char *buffer = bufp->buffer;
1213
1214 print_partial_compiled_pattern (buffer, buffer + bufp->used);
1215 printf ("%ld bytes used/%ld bytes allocated.\n",
1216 bufp->used, bufp->allocated);
1217
1218 if (bufp->fastmap_accurate && bufp->fastmap)
1219 {
1220 printf ("fastmap: ");
1221 print_fastmap (bufp->fastmap);
1222 }
1223
1224 printf ("re_nsub: %d\t", bufp->re_nsub);
1225 printf ("regs_alloc: %d\t", bufp->regs_allocated);
1226 printf ("can_be_null: %d\t", bufp->can_be_null);
1227 printf ("no_sub: %d\t", bufp->no_sub);
1228 printf ("not_bol: %d\t", bufp->not_bol);
1229 printf ("not_eol: %d\t", bufp->not_eol);
1230 printf ("syntax: %lx\n", bufp->syntax);
1231 fflush (stdout);
1232 /* Perhaps we should print the translate table? */
1233 }
1234
1235
1236 void
1237 print_double_string (where, string1, size1, string2, size2)
1238 re_char *where;
1239 re_char *string1;
1240 re_char *string2;
1241 int size1;
1242 int size2;
1243 {
1244 int this_char;
1245
1246 if (where == NULL)
1247 printf ("(null)");
1248 else
1249 {
1250 if (FIRST_STRING_P (where))
1251 {
1252 for (this_char = where - string1; this_char < size1; this_char++)
1253 putchar (string1[this_char]);
1254
1255 where = string2;
1256 }
1257
1258 for (this_char = where - string2; this_char < size2; this_char++)
1259 putchar (string2[this_char]);
1260 }
1261 }
1262
1263 #else /* not DEBUG */
1264
1265 # undef assert
1266 # define assert(e)
1267
1268 # define DEBUG_STATEMENT(e)
1269 # define DEBUG_PRINT1(x)
1270 # define DEBUG_PRINT2(x1, x2)
1271 # define DEBUG_PRINT3(x1, x2, x3)
1272 # define DEBUG_PRINT4(x1, x2, x3, x4)
1273 # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1274 # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1275
1276 #endif /* not DEBUG */
1277 \f
1278 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1279 also be assigned to arbitrarily: each pattern buffer stores its own
1280 syntax, so it can be changed between regex compilations. */
1281 /* This has no initializer because initialized variables in Emacs
1282 become read-only after dumping. */
1283 reg_syntax_t re_syntax_options;
1284
1285
1286 /* Specify the precise syntax of regexps for compilation. This provides
1287 for compatibility for various utilities which historically have
1288 different, incompatible syntaxes.
1289
1290 The argument SYNTAX is a bit mask comprised of the various bits
1291 defined in regex.h. We return the old syntax. */
1292
1293 reg_syntax_t
1294 re_set_syntax (reg_syntax_t syntax)
1295 {
1296 reg_syntax_t ret = re_syntax_options;
1297
1298 re_syntax_options = syntax;
1299 return ret;
1300 }
1301 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1302
1303 /* Regexp to use to replace spaces, or NULL meaning don't. */
1304 static re_char *whitespace_regexp;
1305
1306 void
1307 re_set_whitespace_regexp (const char *regexp)
1308 {
1309 whitespace_regexp = (re_char *) regexp;
1310 }
1311 WEAK_ALIAS (__re_set_syntax, re_set_syntax)
1312 \f
1313 /* This table gives an error message for each of the error codes listed
1314 in regex.h. Obviously the order here has to be same as there.
1315 POSIX doesn't require that we do anything for REG_NOERROR,
1316 but why not be nice? */
1317
1318 static const char *re_error_msgid[] =
1319 {
1320 gettext_noop ("Success"), /* REG_NOERROR */
1321 gettext_noop ("No match"), /* REG_NOMATCH */
1322 gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
1323 gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
1324 gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
1325 gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
1326 gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
1327 gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
1328 gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
1329 gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
1330 gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
1331 gettext_noop ("Invalid range end"), /* REG_ERANGE */
1332 gettext_noop ("Memory exhausted"), /* REG_ESPACE */
1333 gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
1334 gettext_noop ("Premature end of regular expression"), /* REG_EEND */
1335 gettext_noop ("Regular expression too big"), /* REG_ESIZE */
1336 gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
1337 gettext_noop ("Range striding over charsets") /* REG_ERANGEX */
1338 };
1339 \f
1340 /* Avoiding alloca during matching, to placate r_alloc. */
1341
1342 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1343 searching and matching functions should not call alloca. On some
1344 systems, alloca is implemented in terms of malloc, and if we're
1345 using the relocating allocator routines, then malloc could cause a
1346 relocation, which might (if the strings being searched are in the
1347 ralloc heap) shift the data out from underneath the regexp
1348 routines.
1349
1350 Here's another reason to avoid allocation: Emacs
1351 processes input from X in a signal handler; processing X input may
1352 call malloc; if input arrives while a matching routine is calling
1353 malloc, then we're scrod. But Emacs can't just block input while
1354 calling matching routines; then we don't notice interrupts when
1355 they come in. So, Emacs blocks input around all regexp calls
1356 except the matching calls, which it leaves unprotected, in the
1357 faith that they will not malloc. */
1358
1359 /* Normally, this is fine. */
1360 #define MATCH_MAY_ALLOCATE
1361
1362 /* The match routines may not allocate if (1) they would do it with malloc
1363 and (2) it's not safe for them to use malloc.
1364 Note that if REL_ALLOC is defined, matching would not use malloc for the
1365 failure stack, but we would still use it for the register vectors;
1366 so REL_ALLOC should not affect this. */
1367 #if defined REGEX_MALLOC && defined emacs
1368 # undef MATCH_MAY_ALLOCATE
1369 #endif
1370
1371 \f
1372 /* Failure stack declarations and macros; both re_compile_fastmap and
1373 re_match_2 use a failure stack. These have to be macros because of
1374 REGEX_ALLOCATE_STACK. */
1375
1376
1377 /* Approximate number of failure points for which to initially allocate space
1378 when matching. If this number is exceeded, we allocate more
1379 space, so it is not a hard limit. */
1380 #ifndef INIT_FAILURE_ALLOC
1381 # define INIT_FAILURE_ALLOC 20
1382 #endif
1383
1384 /* Roughly the maximum number of failure points on the stack. Would be
1385 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.
1386 This is a variable only so users of regex can assign to it; we never
1387 change it ourselves. We always multiply it by TYPICAL_FAILURE_SIZE
1388 before using it, so it should probably be a byte-count instead. */
1389 # if defined MATCH_MAY_ALLOCATE
1390 /* Note that 4400 was enough to cause a crash on Alpha OSF/1,
1391 whose default stack limit is 2mb. In order for a larger
1392 value to work reliably, you have to try to make it accord
1393 with the process stack limit. */
1394 size_t re_max_failures = 40000;
1395 # else
1396 size_t re_max_failures = 4000;
1397 # endif
1398
1399 union fail_stack_elt
1400 {
1401 re_char *pointer;
1402 /* This should be the biggest `int' that's no bigger than a pointer. */
1403 long integer;
1404 };
1405
1406 typedef union fail_stack_elt fail_stack_elt_t;
1407
1408 typedef struct
1409 {
1410 fail_stack_elt_t *stack;
1411 size_t size;
1412 size_t avail; /* Offset of next open position. */
1413 size_t frame; /* Offset of the cur constructed frame. */
1414 } fail_stack_type;
1415
1416 #define FAIL_STACK_EMPTY() (fail_stack.frame == 0)
1417 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1418
1419
1420 /* Define macros to initialize and free the failure stack.
1421 Do `return -2' if the alloc fails. */
1422
1423 #ifdef MATCH_MAY_ALLOCATE
1424 # define INIT_FAIL_STACK() \
1425 do { \
1426 fail_stack.stack = (fail_stack_elt_t *) \
1427 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \
1428 * sizeof (fail_stack_elt_t)); \
1429 \
1430 if (fail_stack.stack == NULL) \
1431 return -2; \
1432 \
1433 fail_stack.size = INIT_FAILURE_ALLOC; \
1434 fail_stack.avail = 0; \
1435 fail_stack.frame = 0; \
1436 } while (0)
1437
1438 # define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1439 #else
1440 # define INIT_FAIL_STACK() \
1441 do { \
1442 fail_stack.avail = 0; \
1443 fail_stack.frame = 0; \
1444 } while (0)
1445
1446 # define RESET_FAIL_STACK() ((void)0)
1447 #endif
1448
1449
1450 /* Double the size of FAIL_STACK, up to a limit
1451 which allows approximately `re_max_failures' items.
1452
1453 Return 1 if succeeds, and 0 if either ran out of memory
1454 allocating space for it or it was already too large.
1455
1456 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1457
1458 /* Factor to increase the failure stack size by
1459 when we increase it.
1460 This used to be 2, but 2 was too wasteful
1461 because the old discarded stacks added up to as much space
1462 were as ultimate, maximum-size stack. */
1463 #define FAIL_STACK_GROWTH_FACTOR 4
1464
1465 #define GROW_FAIL_STACK(fail_stack) \
1466 (((fail_stack).size * sizeof (fail_stack_elt_t) \
1467 >= re_max_failures * TYPICAL_FAILURE_SIZE) \
1468 ? 0 \
1469 : ((fail_stack).stack \
1470 = (fail_stack_elt_t *) \
1471 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1472 (fail_stack).size * sizeof (fail_stack_elt_t), \
1473 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1474 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1475 * FAIL_STACK_GROWTH_FACTOR))), \
1476 \
1477 (fail_stack).stack == NULL \
1478 ? 0 \
1479 : ((fail_stack).size \
1480 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \
1481 ((fail_stack).size * sizeof (fail_stack_elt_t) \
1482 * FAIL_STACK_GROWTH_FACTOR)) \
1483 / sizeof (fail_stack_elt_t)), \
1484 1)))
1485
1486
1487 /* Push a pointer value onto the failure stack.
1488 Assumes the variable `fail_stack'. Probably should only
1489 be called from within `PUSH_FAILURE_POINT'. */
1490 #define PUSH_FAILURE_POINTER(item) \
1491 fail_stack.stack[fail_stack.avail++].pointer = (item)
1492
1493 /* This pushes an integer-valued item onto the failure stack.
1494 Assumes the variable `fail_stack'. Probably should only
1495 be called from within `PUSH_FAILURE_POINT'. */
1496 #define PUSH_FAILURE_INT(item) \
1497 fail_stack.stack[fail_stack.avail++].integer = (item)
1498
1499 /* Push a fail_stack_elt_t value onto the failure stack.
1500 Assumes the variable `fail_stack'. Probably should only
1501 be called from within `PUSH_FAILURE_POINT'. */
1502 #define PUSH_FAILURE_ELT(item) \
1503 fail_stack.stack[fail_stack.avail++] = (item)
1504
1505 /* These three POP... operations complement the three PUSH... operations.
1506 All assume that `fail_stack' is nonempty. */
1507 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1508 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1509 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1510
1511 /* Individual items aside from the registers. */
1512 #define NUM_NONREG_ITEMS 3
1513
1514 /* Used to examine the stack (to detect infinite loops). */
1515 #define FAILURE_PAT(h) fail_stack.stack[(h) - 1].pointer
1516 #define FAILURE_STR(h) (fail_stack.stack[(h) - 2].pointer)
1517 #define NEXT_FAILURE_HANDLE(h) fail_stack.stack[(h) - 3].integer
1518 #define TOP_FAILURE_HANDLE() fail_stack.frame
1519
1520
1521 #define ENSURE_FAIL_STACK(space) \
1522 while (REMAINING_AVAIL_SLOTS <= space) { \
1523 if (!GROW_FAIL_STACK (fail_stack)) \
1524 return -2; \
1525 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", (fail_stack).size);\
1526 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
1527 }
1528
1529 /* Push register NUM onto the stack. */
1530 #define PUSH_FAILURE_REG(num) \
1531 do { \
1532 char *destination; \
1533 ENSURE_FAIL_STACK(3); \
1534 DEBUG_PRINT4 (" Push reg %d (spanning %p -> %p)\n", \
1535 num, regstart[num], regend[num]); \
1536 PUSH_FAILURE_POINTER (regstart[num]); \
1537 PUSH_FAILURE_POINTER (regend[num]); \
1538 PUSH_FAILURE_INT (num); \
1539 } while (0)
1540
1541 /* Change the counter's value to VAL, but make sure that it will
1542 be reset when backtracking. */
1543 #define PUSH_NUMBER(ptr,val) \
1544 do { \
1545 char *destination; \
1546 int c; \
1547 ENSURE_FAIL_STACK(3); \
1548 EXTRACT_NUMBER (c, ptr); \
1549 DEBUG_PRINT4 (" Push number %p = %d -> %d\n", ptr, c, val); \
1550 PUSH_FAILURE_INT (c); \
1551 PUSH_FAILURE_POINTER (ptr); \
1552 PUSH_FAILURE_INT (-1); \
1553 STORE_NUMBER (ptr, val); \
1554 } while (0)
1555
1556 /* Pop a saved register off the stack. */
1557 #define POP_FAILURE_REG_OR_COUNT() \
1558 do { \
1559 int reg = POP_FAILURE_INT (); \
1560 if (reg == -1) \
1561 { \
1562 /* It's a counter. */ \
1563 /* Here, we discard `const', making re_match non-reentrant. */ \
1564 unsigned char *ptr = (unsigned char*) POP_FAILURE_POINTER (); \
1565 reg = POP_FAILURE_INT (); \
1566 STORE_NUMBER (ptr, reg); \
1567 DEBUG_PRINT3 (" Pop counter %p = %d\n", ptr, reg); \
1568 } \
1569 else \
1570 { \
1571 regend[reg] = POP_FAILURE_POINTER (); \
1572 regstart[reg] = POP_FAILURE_POINTER (); \
1573 DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
1574 reg, regstart[reg], regend[reg]); \
1575 } \
1576 } while (0)
1577
1578 /* Check that we are not stuck in an infinite loop. */
1579 #define CHECK_INFINITE_LOOP(pat_cur, string_place) \
1580 do { \
1581 int failure = TOP_FAILURE_HANDLE (); \
1582 /* Check for infinite matching loops */ \
1583 while (failure > 0 \
1584 && (FAILURE_STR (failure) == string_place \
1585 || FAILURE_STR (failure) == NULL)) \
1586 { \
1587 assert (FAILURE_PAT (failure) >= bufp->buffer \
1588 && FAILURE_PAT (failure) <= bufp->buffer + bufp->used); \
1589 if (FAILURE_PAT (failure) == pat_cur) \
1590 { \
1591 cycle = 1; \
1592 break; \
1593 } \
1594 DEBUG_PRINT2 (" Other pattern: %p\n", FAILURE_PAT (failure)); \
1595 failure = NEXT_FAILURE_HANDLE(failure); \
1596 } \
1597 DEBUG_PRINT2 (" Other string: %p\n", FAILURE_STR (failure)); \
1598 } while (0)
1599
1600 /* Push the information about the state we will need
1601 if we ever fail back to it.
1602
1603 Requires variables fail_stack, regstart, regend and
1604 num_regs be declared. GROW_FAIL_STACK requires `destination' be
1605 declared.
1606
1607 Does `return FAILURE_CODE' if runs out of memory. */
1608
1609 #define PUSH_FAILURE_POINT(pattern, string_place) \
1610 do { \
1611 char *destination; \
1612 /* Must be int, so when we don't save any registers, the arithmetic \
1613 of 0 + -1 isn't done as unsigned. */ \
1614 \
1615 DEBUG_STATEMENT (nfailure_points_pushed++); \
1616 DEBUG_PRINT1 ("\nPUSH_FAILURE_POINT:\n"); \
1617 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
1618 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
1619 \
1620 ENSURE_FAIL_STACK (NUM_NONREG_ITEMS); \
1621 \
1622 DEBUG_PRINT1 ("\n"); \
1623 \
1624 DEBUG_PRINT2 (" Push frame index: %d\n", fail_stack.frame); \
1625 PUSH_FAILURE_INT (fail_stack.frame); \
1626 \
1627 DEBUG_PRINT2 (" Push string %p: `", string_place); \
1628 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\
1629 DEBUG_PRINT1 ("'\n"); \
1630 PUSH_FAILURE_POINTER (string_place); \
1631 \
1632 DEBUG_PRINT2 (" Push pattern %p: ", pattern); \
1633 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern, pend); \
1634 PUSH_FAILURE_POINTER (pattern); \
1635 \
1636 /* Close the frame by moving the frame pointer past it. */ \
1637 fail_stack.frame = fail_stack.avail; \
1638 } while (0)
1639
1640 /* Estimate the size of data pushed by a typical failure stack entry.
1641 An estimate is all we need, because all we use this for
1642 is to choose a limit for how big to make the failure stack. */
1643 /* BEWARE, the value `20' is hard-coded in emacs.c:main(). */
1644 #define TYPICAL_FAILURE_SIZE 20
1645
1646 /* How many items can still be added to the stack without overflowing it. */
1647 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
1648
1649
1650 /* Pops what PUSH_FAIL_STACK pushes.
1651
1652 We restore into the parameters, all of which should be lvalues:
1653 STR -- the saved data position.
1654 PAT -- the saved pattern position.
1655 REGSTART, REGEND -- arrays of string positions.
1656
1657 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1658 `pend', `string1', `size1', `string2', and `size2'. */
1659
1660 #define POP_FAILURE_POINT(str, pat) \
1661 do { \
1662 assert (!FAIL_STACK_EMPTY ()); \
1663 \
1664 /* Remove failure points and point to how many regs pushed. */ \
1665 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1666 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
1667 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
1668 \
1669 /* Pop the saved registers. */ \
1670 while (fail_stack.frame < fail_stack.avail) \
1671 POP_FAILURE_REG_OR_COUNT (); \
1672 \
1673 pat = POP_FAILURE_POINTER (); \
1674 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1675 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
1676 \
1677 /* If the saved string location is NULL, it came from an \
1678 on_failure_keep_string_jump opcode, and we want to throw away the \
1679 saved NULL, thus retaining our current position in the string. */ \
1680 str = POP_FAILURE_POINTER (); \
1681 DEBUG_PRINT2 (" Popping string %p: `", str); \
1682 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
1683 DEBUG_PRINT1 ("'\n"); \
1684 \
1685 fail_stack.frame = POP_FAILURE_INT (); \
1686 DEBUG_PRINT2 (" Popping frame index: %d\n", fail_stack.frame); \
1687 \
1688 assert (fail_stack.avail >= 0); \
1689 assert (fail_stack.frame <= fail_stack.avail); \
1690 \
1691 DEBUG_STATEMENT (nfailure_points_popped++); \
1692 } while (0) /* POP_FAILURE_POINT */
1693
1694
1695 \f
1696 /* Registers are set to a sentinel when they haven't yet matched. */
1697 #define REG_UNSET(e) ((e) == NULL)
1698 \f
1699 /* Subroutine declarations and macros for regex_compile. */
1700
1701 static reg_errcode_t regex_compile _RE_ARGS ((re_char *pattern, size_t size,
1702 reg_syntax_t syntax,
1703 struct re_pattern_buffer *bufp));
1704 static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));
1705 static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1706 int arg1, int arg2));
1707 static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1708 int arg, unsigned char *end));
1709 static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,
1710 int arg1, int arg2, unsigned char *end));
1711 static boolean at_begline_loc_p _RE_ARGS ((re_char *pattern,
1712 re_char *p,
1713 reg_syntax_t syntax));
1714 static boolean at_endline_loc_p _RE_ARGS ((re_char *p,
1715 re_char *pend,
1716 reg_syntax_t syntax));
1717 static re_char *skip_one_char _RE_ARGS ((re_char *p));
1718 static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
1719 char *fastmap, const int multibyte));
1720
1721 /* Fetch the next character in the uncompiled pattern, with no
1722 translation. */
1723 #define PATFETCH(c) \
1724 do { \
1725 int len; \
1726 if (p == pend) return REG_EEND; \
1727 c = RE_STRING_CHAR_AND_LENGTH (p, len, multibyte); \
1728 p += len; \
1729 } while (0)
1730
1731
1732 /* If `translate' is non-null, return translate[D], else just D. We
1733 cast the subscript to translate because some data is declared as
1734 `char *', to avoid warnings when a string constant is passed. But
1735 when we use a character as a subscript we must make it unsigned. */
1736 #ifndef TRANSLATE
1737 # define TRANSLATE(d) \
1738 (RE_TRANSLATE_P (translate) ? RE_TRANSLATE (translate, (d)) : (d))
1739 #endif
1740
1741
1742 /* Macros for outputting the compiled pattern into `buffer'. */
1743
1744 /* If the buffer isn't allocated when it comes in, use this. */
1745 #define INIT_BUF_SIZE 32
1746
1747 /* Make sure we have at least N more bytes of space in buffer. */
1748 #define GET_BUFFER_SPACE(n) \
1749 while ((size_t) (b - bufp->buffer + (n)) > bufp->allocated) \
1750 EXTEND_BUFFER ()
1751
1752 /* Make sure we have one more byte of buffer space and then add C to it. */
1753 #define BUF_PUSH(c) \
1754 do { \
1755 GET_BUFFER_SPACE (1); \
1756 *b++ = (unsigned char) (c); \
1757 } while (0)
1758
1759
1760 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1761 #define BUF_PUSH_2(c1, c2) \
1762 do { \
1763 GET_BUFFER_SPACE (2); \
1764 *b++ = (unsigned char) (c1); \
1765 *b++ = (unsigned char) (c2); \
1766 } while (0)
1767
1768
1769 /* As with BUF_PUSH_2, except for three bytes. */
1770 #define BUF_PUSH_3(c1, c2, c3) \
1771 do { \
1772 GET_BUFFER_SPACE (3); \
1773 *b++ = (unsigned char) (c1); \
1774 *b++ = (unsigned char) (c2); \
1775 *b++ = (unsigned char) (c3); \
1776 } while (0)
1777
1778
1779 /* Store a jump with opcode OP at LOC to location TO. We store a
1780 relative address offset by the three bytes the jump itself occupies. */
1781 #define STORE_JUMP(op, loc, to) \
1782 store_op1 (op, loc, (to) - (loc) - 3)
1783
1784 /* Likewise, for a two-argument jump. */
1785 #define STORE_JUMP2(op, loc, to, arg) \
1786 store_op2 (op, loc, (to) - (loc) - 3, arg)
1787
1788 /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
1789 #define INSERT_JUMP(op, loc, to) \
1790 insert_op1 (op, loc, (to) - (loc) - 3, b)
1791
1792 /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
1793 #define INSERT_JUMP2(op, loc, to, arg) \
1794 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
1795
1796
1797 /* This is not an arbitrary limit: the arguments which represent offsets
1798 into the pattern are two bytes long. So if 2^15 bytes turns out to
1799 be too small, many things would have to change. */
1800 # define MAX_BUF_SIZE (1L << 15)
1801
1802 #if 0 /* This is when we thought it could be 2^16 bytes. */
1803 /* Any other compiler which, like MSC, has allocation limit below 2^16
1804 bytes will have to use approach similar to what was done below for
1805 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up
1806 reallocating to 0 bytes. Such thing is not going to work too well.
1807 You have been warned!! */
1808 #if defined _MSC_VER && !defined WIN32
1809 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. */
1810 # define MAX_BUF_SIZE 65500L
1811 #else
1812 # define MAX_BUF_SIZE (1L << 16)
1813 #endif
1814 #endif /* 0 */
1815
1816 /* Extend the buffer by twice its current size via realloc and
1817 reset the pointers that pointed into the old block to point to the
1818 correct places in the new one. If extending the buffer results in it
1819 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1820 #if __BOUNDED_POINTERS__
1821 # define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
1822 # define MOVE_BUFFER_POINTER(P) \
1823 (__ptrlow (P) = new_buffer + (__ptrlow (P) - old_buffer), \
1824 SET_HIGH_BOUND (P), \
1825 __ptrvalue (P) = new_buffer + (__ptrvalue (P) - old_buffer))
1826 # define ELSE_EXTEND_BUFFER_HIGH_BOUND \
1827 else \
1828 { \
1829 SET_HIGH_BOUND (b); \
1830 SET_HIGH_BOUND (begalt); \
1831 if (fixup_alt_jump) \
1832 SET_HIGH_BOUND (fixup_alt_jump); \
1833 if (laststart) \
1834 SET_HIGH_BOUND (laststart); \
1835 if (pending_exact) \
1836 SET_HIGH_BOUND (pending_exact); \
1837 }
1838 #else
1839 # define MOVE_BUFFER_POINTER(P) ((P) = new_buffer + ((P) - old_buffer))
1840 # define ELSE_EXTEND_BUFFER_HIGH_BOUND
1841 #endif
1842 #define EXTEND_BUFFER() \
1843 do { \
1844 unsigned char *old_buffer = bufp->buffer; \
1845 if (bufp->allocated == MAX_BUF_SIZE) \
1846 return REG_ESIZE; \
1847 bufp->allocated <<= 1; \
1848 if (bufp->allocated > MAX_BUF_SIZE) \
1849 bufp->allocated = MAX_BUF_SIZE; \
1850 RETALLOC (bufp->buffer, bufp->allocated, unsigned char); \
1851 if (bufp->buffer == NULL) \
1852 return REG_ESPACE; \
1853 /* If the buffer moved, move all the pointers into it. */ \
1854 if (old_buffer != bufp->buffer) \
1855 { \
1856 unsigned char *new_buffer = bufp->buffer; \
1857 MOVE_BUFFER_POINTER (b); \
1858 MOVE_BUFFER_POINTER (begalt); \
1859 if (fixup_alt_jump) \
1860 MOVE_BUFFER_POINTER (fixup_alt_jump); \
1861 if (laststart) \
1862 MOVE_BUFFER_POINTER (laststart); \
1863 if (pending_exact) \
1864 MOVE_BUFFER_POINTER (pending_exact); \
1865 } \
1866 ELSE_EXTEND_BUFFER_HIGH_BOUND \
1867 } while (0)
1868
1869
1870 /* Since we have one byte reserved for the register number argument to
1871 {start,stop}_memory, the maximum number of groups we can report
1872 things about is what fits in that byte. */
1873 #define MAX_REGNUM 255
1874
1875 /* But patterns can have more than `MAX_REGNUM' registers. We just
1876 ignore the excess. */
1877 typedef int regnum_t;
1878
1879
1880 /* Macros for the compile stack. */
1881
1882 /* Since offsets can go either forwards or backwards, this type needs to
1883 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1884 /* int may be not enough when sizeof(int) == 2. */
1885 typedef long pattern_offset_t;
1886
1887 typedef struct
1888 {
1889 pattern_offset_t begalt_offset;
1890 pattern_offset_t fixup_alt_jump;
1891 pattern_offset_t laststart_offset;
1892 regnum_t regnum;
1893 } compile_stack_elt_t;
1894
1895
1896 typedef struct
1897 {
1898 compile_stack_elt_t *stack;
1899 unsigned size;
1900 unsigned avail; /* Offset of next open position. */
1901 } compile_stack_type;
1902
1903
1904 #define INIT_COMPILE_STACK_SIZE 32
1905
1906 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1907 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1908
1909 /* The next available element. */
1910 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1911
1912 /* Explicit quit checking is only used on NTemacs and whenever we
1913 use polling to process input events. */
1914 #if defined emacs && (defined WINDOWSNT || defined SYNC_INPUT) && defined QUIT
1915 extern int immediate_quit;
1916 # define IMMEDIATE_QUIT_CHECK \
1917 do { \
1918 if (immediate_quit) QUIT; \
1919 } while (0)
1920 #else
1921 # define IMMEDIATE_QUIT_CHECK ((void)0)
1922 #endif
1923 \f
1924 /* Structure to manage work area for range table. */
1925 struct range_table_work_area
1926 {
1927 int *table; /* actual work area. */
1928 int allocated; /* allocated size for work area in bytes. */
1929 int used; /* actually used size in words. */
1930 int bits; /* flag to record character classes */
1931 };
1932
1933 /* Make sure that WORK_AREA can hold more N multibyte characters.
1934 This is used only in set_image_of_range and set_image_of_range_1.
1935 It expects WORK_AREA to be a pointer.
1936 If it can't get the space, it returns from the surrounding function. */
1937
1938 #define EXTEND_RANGE_TABLE(work_area, n) \
1939 do { \
1940 if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \
1941 { \
1942 extend_range_table_work_area (&work_area); \
1943 if ((work_area).table == 0) \
1944 return (REG_ESPACE); \
1945 } \
1946 } while (0)
1947
1948 #define SET_RANGE_TABLE_WORK_AREA_BIT(work_area, bit) \
1949 (work_area).bits |= (bit)
1950
1951 /* Bits used to implement the multibyte-part of the various character classes
1952 such as [:alnum:] in a charset's range table. */
1953 #define BIT_WORD 0x1
1954 #define BIT_LOWER 0x2
1955 #define BIT_PUNCT 0x4
1956 #define BIT_SPACE 0x8
1957 #define BIT_UPPER 0x10
1958 #define BIT_MULTIBYTE 0x20
1959
1960 /* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */
1961 #define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \
1962 do { \
1963 EXTEND_RANGE_TABLE ((work_area), 2); \
1964 (work_area).table[(work_area).used++] = (range_start); \
1965 (work_area).table[(work_area).used++] = (range_end); \
1966 } while (0)
1967
1968 /* Free allocated memory for WORK_AREA. */
1969 #define FREE_RANGE_TABLE_WORK_AREA(work_area) \
1970 do { \
1971 if ((work_area).table) \
1972 free ((work_area).table); \
1973 } while (0)
1974
1975 #define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0, (work_area).bits = 0)
1976 #define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)
1977 #define RANGE_TABLE_WORK_BITS(work_area) ((work_area).bits)
1978 #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
1979 \f
1980
1981 /* Set the bit for character C in a list. */
1982 #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH))
1983
1984
1985 #ifdef emacs
1986
1987 /* Store characters in the range FROM to TO in the bitmap at B (for
1988 ASCII and unibyte characters) and WORK_AREA (for multibyte
1989 characters) while translating them and paying attention to the
1990 continuity of translated characters.
1991
1992 Implementation note: It is better to implement these fairly big
1993 macros by a function, but it's not that easy because macros called
1994 in this macro assume various local variables already declared. */
1995
1996 /* Both FROM and TO are ASCII characters. */
1997
1998 #define SETUP_ASCII_RANGE(work_area, FROM, TO) \
1999 do { \
2000 int C0, C1; \
2001 \
2002 for (C0 = (FROM); C0 <= (TO); C0++) \
2003 { \
2004 C1 = TRANSLATE (C0); \
2005 if (! ASCII_CHAR_P (C1)) \
2006 { \
2007 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2008 if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \
2009 C1 = C0; \
2010 } \
2011 SET_LIST_BIT (C1); \
2012 } \
2013 } while (0)
2014
2015
2016 /* Both FROM and TO are unibyte characters (0x80..0xFF). */
2017
2018 #define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \
2019 do { \
2020 int C0, C1, C2, I; \
2021 int USED = RANGE_TABLE_WORK_USED (work_area); \
2022 \
2023 for (C0 = (FROM); C0 <= (TO); C0++) \
2024 { \
2025 C1 = RE_CHAR_TO_MULTIBYTE (C0); \
2026 if (CHAR_BYTE8_P (C1)) \
2027 SET_LIST_BIT (C0); \
2028 else \
2029 { \
2030 C2 = TRANSLATE (C1); \
2031 if (C2 == C1 \
2032 || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \
2033 C1 = C0; \
2034 SET_LIST_BIT (C1); \
2035 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2036 { \
2037 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2038 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2039 \
2040 if (C2 >= from - 1 && C2 <= to + 1) \
2041 { \
2042 if (C2 == from - 1) \
2043 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2044 else if (C2 == to + 1) \
2045 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2046 break; \
2047 } \
2048 } \
2049 if (I < USED) \
2050 SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \
2051 } \
2052 } \
2053 } while (0)
2054
2055
2056 /* Both FROM and TO are multibyte characters. */
2057
2058 #define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \
2059 do { \
2060 int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \
2061 \
2062 SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \
2063 for (C0 = (FROM); C0 <= (TO); C0++) \
2064 { \
2065 C1 = TRANSLATE (C0); \
2066 if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \
2067 || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \
2068 SET_LIST_BIT (C2); \
2069 if (C1 >= (FROM) && C1 <= (TO)) \
2070 continue; \
2071 for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \
2072 { \
2073 int from = RANGE_TABLE_WORK_ELT (work_area, I); \
2074 int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \
2075 \
2076 if (C1 >= from - 1 && C1 <= to + 1) \
2077 { \
2078 if (C1 == from - 1) \
2079 RANGE_TABLE_WORK_ELT (work_area, I)--; \
2080 else if (C1 == to + 1) \
2081 RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \
2082 break; \
2083 } \
2084 } \
2085 if (I < USED) \
2086 SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \
2087 } \
2088 } while (0)
2089
2090 #endif /* emacs */
2091
2092 /* Get the next unsigned number in the uncompiled pattern. */
2093 #define GET_UNSIGNED_NUMBER(num) \
2094 do { \
2095 if (p == pend) \
2096 FREE_STACK_RETURN (REG_EBRACE); \
2097 else \
2098 { \
2099 PATFETCH (c); \
2100 while ('0' <= c && c <= '9') \
2101 { \
2102 int prev; \
2103 if (num < 0) \
2104 num = 0; \
2105 prev = num; \
2106 num = num * 10 + c - '0'; \
2107 if (num / 10 != prev) \
2108 FREE_STACK_RETURN (REG_BADBR); \
2109 if (p == pend) \
2110 FREE_STACK_RETURN (REG_EBRACE); \
2111 PATFETCH (c); \
2112 } \
2113 } \
2114 } while (0)
2115 \f
2116 #if ! WIDE_CHAR_SUPPORT
2117
2118 /* Map a string to the char class it names (if any). */
2119 re_wctype_t
2120 re_wctype (const re_char *str)
2121 {
2122 const char *string = str;
2123 if (STREQ (string, "alnum")) return RECC_ALNUM;
2124 else if (STREQ (string, "alpha")) return RECC_ALPHA;
2125 else if (STREQ (string, "word")) return RECC_WORD;
2126 else if (STREQ (string, "ascii")) return RECC_ASCII;
2127 else if (STREQ (string, "nonascii")) return RECC_NONASCII;
2128 else if (STREQ (string, "graph")) return RECC_GRAPH;
2129 else if (STREQ (string, "lower")) return RECC_LOWER;
2130 else if (STREQ (string, "print")) return RECC_PRINT;
2131 else if (STREQ (string, "punct")) return RECC_PUNCT;
2132 else if (STREQ (string, "space")) return RECC_SPACE;
2133 else if (STREQ (string, "upper")) return RECC_UPPER;
2134 else if (STREQ (string, "unibyte")) return RECC_UNIBYTE;
2135 else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE;
2136 else if (STREQ (string, "digit")) return RECC_DIGIT;
2137 else if (STREQ (string, "xdigit")) return RECC_XDIGIT;
2138 else if (STREQ (string, "cntrl")) return RECC_CNTRL;
2139 else if (STREQ (string, "blank")) return RECC_BLANK;
2140 else return 0;
2141 }
2142
2143 /* True if CH is in the char class CC. */
2144 boolean
2145 re_iswctype (int ch, re_wctype_t cc)
2146 {
2147 switch (cc)
2148 {
2149 case RECC_ALNUM: return ISALNUM (ch);
2150 case RECC_ALPHA: return ISALPHA (ch);
2151 case RECC_BLANK: return ISBLANK (ch);
2152 case RECC_CNTRL: return ISCNTRL (ch);
2153 case RECC_DIGIT: return ISDIGIT (ch);
2154 case RECC_GRAPH: return ISGRAPH (ch);
2155 case RECC_LOWER: return ISLOWER (ch);
2156 case RECC_PRINT: return ISPRINT (ch);
2157 case RECC_PUNCT: return ISPUNCT (ch);
2158 case RECC_SPACE: return ISSPACE (ch);
2159 case RECC_UPPER: return ISUPPER (ch);
2160 case RECC_XDIGIT: return ISXDIGIT (ch);
2161 case RECC_ASCII: return IS_REAL_ASCII (ch);
2162 case RECC_NONASCII: return !IS_REAL_ASCII (ch);
2163 case RECC_UNIBYTE: return ISUNIBYTE (ch);
2164 case RECC_MULTIBYTE: return !ISUNIBYTE (ch);
2165 case RECC_WORD: return ISWORD (ch);
2166 case RECC_ERROR: return false;
2167 default:
2168 abort();
2169 }
2170 }
2171
2172 /* Return a bit-pattern to use in the range-table bits to match multibyte
2173 chars of class CC. */
2174 static int
2175 re_wctype_to_bit (re_wctype_t cc)
2176 {
2177 switch (cc)
2178 {
2179 case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
2180 case RECC_MULTIBYTE: return BIT_MULTIBYTE;
2181 case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD;
2182 case RECC_LOWER: return BIT_LOWER;
2183 case RECC_UPPER: return BIT_UPPER;
2184 case RECC_PUNCT: return BIT_PUNCT;
2185 case RECC_SPACE: return BIT_SPACE;
2186 case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
2187 case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
2188 default:
2189 abort();
2190 }
2191 }
2192 #endif
2193 \f
2194 /* Filling in the work area of a range. */
2195
2196 /* Actually extend the space in WORK_AREA. */
2197
2198 static void
2199 extend_range_table_work_area (struct range_table_work_area *work_area)
2200 {
2201 work_area->allocated += 16 * sizeof (int);
2202 if (work_area->table)
2203 work_area->table
2204 = (int *) realloc (work_area->table, work_area->allocated);
2205 else
2206 work_area->table
2207 = (int *) malloc (work_area->allocated);
2208 }
2209
2210 #if 0
2211 #ifdef emacs
2212
2213 /* Carefully find the ranges of codes that are equivalent
2214 under case conversion to the range start..end when passed through
2215 TRANSLATE. Handle the case where non-letters can come in between
2216 two upper-case letters (which happens in Latin-1).
2217 Also handle the case of groups of more than 2 case-equivalent chars.
2218
2219 The basic method is to look at consecutive characters and see
2220 if they can form a run that can be handled as one.
2221
2222 Returns -1 if successful, REG_ESPACE if ran out of space. */
2223
2224 static int
2225 set_image_of_range_1 (work_area, start, end, translate)
2226 RE_TRANSLATE_TYPE translate;
2227 struct range_table_work_area *work_area;
2228 re_wchar_t start, end;
2229 {
2230 /* `one_case' indicates a character, or a run of characters,
2231 each of which is an isolate (no case-equivalents).
2232 This includes all ASCII non-letters.
2233
2234 `two_case' indicates a character, or a run of characters,
2235 each of which has two case-equivalent forms.
2236 This includes all ASCII letters.
2237
2238 `strange' indicates a character that has more than one
2239 case-equivalent. */
2240
2241 enum case_type {one_case, two_case, strange};
2242
2243 /* Describe the run that is in progress,
2244 which the next character can try to extend.
2245 If run_type is strange, that means there really is no run.
2246 If run_type is one_case, then run_start...run_end is the run.
2247 If run_type is two_case, then the run is run_start...run_end,
2248 and the case-equivalents end at run_eqv_end. */
2249
2250 enum case_type run_type = strange;
2251 int run_start, run_end, run_eqv_end;
2252
2253 Lisp_Object eqv_table;
2254
2255 if (!RE_TRANSLATE_P (translate))
2256 {
2257 EXTEND_RANGE_TABLE (work_area, 2);
2258 work_area->table[work_area->used++] = (start);
2259 work_area->table[work_area->used++] = (end);
2260 return -1;
2261 }
2262
2263 eqv_table = XCHAR_TABLE (translate)->extras[2];
2264
2265 for (; start <= end; start++)
2266 {
2267 enum case_type this_type;
2268 int eqv = RE_TRANSLATE (eqv_table, start);
2269 int minchar, maxchar;
2270
2271 /* Classify this character */
2272 if (eqv == start)
2273 this_type = one_case;
2274 else if (RE_TRANSLATE (eqv_table, eqv) == start)
2275 this_type = two_case;
2276 else
2277 this_type = strange;
2278
2279 if (start < eqv)
2280 minchar = start, maxchar = eqv;
2281 else
2282 minchar = eqv, maxchar = start;
2283
2284 /* Can this character extend the run in progress? */
2285 if (this_type == strange || this_type != run_type
2286 || !(minchar == run_end + 1
2287 && (run_type == two_case
2288 ? maxchar == run_eqv_end + 1 : 1)))
2289 {
2290 /* No, end the run.
2291 Record each of its equivalent ranges. */
2292 if (run_type == one_case)
2293 {
2294 EXTEND_RANGE_TABLE (work_area, 2);
2295 work_area->table[work_area->used++] = run_start;
2296 work_area->table[work_area->used++] = run_end;
2297 }
2298 else if (run_type == two_case)
2299 {
2300 EXTEND_RANGE_TABLE (work_area, 4);
2301 work_area->table[work_area->used++] = run_start;
2302 work_area->table[work_area->used++] = run_end;
2303 work_area->table[work_area->used++]
2304 = RE_TRANSLATE (eqv_table, run_start);
2305 work_area->table[work_area->used++]
2306 = RE_TRANSLATE (eqv_table, run_end);
2307 }
2308 run_type = strange;
2309 }
2310
2311 if (this_type == strange)
2312 {
2313 /* For a strange character, add each of its equivalents, one
2314 by one. Don't start a range. */
2315 do
2316 {
2317 EXTEND_RANGE_TABLE (work_area, 2);
2318 work_area->table[work_area->used++] = eqv;
2319 work_area->table[work_area->used++] = eqv;
2320 eqv = RE_TRANSLATE (eqv_table, eqv);
2321 }
2322 while (eqv != start);
2323 }
2324
2325 /* Add this char to the run, or start a new run. */
2326 else if (run_type == strange)
2327 {
2328 /* Initialize a new range. */
2329 run_type = this_type;
2330 run_start = start;
2331 run_end = start;
2332 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2333 }
2334 else
2335 {
2336 /* Extend a running range. */
2337 run_end = minchar;
2338 run_eqv_end = RE_TRANSLATE (eqv_table, run_end);
2339 }
2340 }
2341
2342 /* If a run is still in progress at the end, finish it now
2343 by recording its equivalent ranges. */
2344 if (run_type == one_case)
2345 {
2346 EXTEND_RANGE_TABLE (work_area, 2);
2347 work_area->table[work_area->used++] = run_start;
2348 work_area->table[work_area->used++] = run_end;
2349 }
2350 else if (run_type == two_case)
2351 {
2352 EXTEND_RANGE_TABLE (work_area, 4);
2353 work_area->table[work_area->used++] = run_start;
2354 work_area->table[work_area->used++] = run_end;
2355 work_area->table[work_area->used++]
2356 = RE_TRANSLATE (eqv_table, run_start);
2357 work_area->table[work_area->used++]
2358 = RE_TRANSLATE (eqv_table, run_end);
2359 }
2360
2361 return -1;
2362 }
2363
2364 #endif /* emacs */
2365
2366 /* Record the image of the range start..end when passed through
2367 TRANSLATE. This is not necessarily TRANSLATE(start)..TRANSLATE(end)
2368 and is not even necessarily contiguous.
2369 Normally we approximate it with the smallest contiguous range that contains
2370 all the chars we need. However, for Latin-1 we go to extra effort
2371 to do a better job.
2372
2373 This function is not called for ASCII ranges.
2374
2375 Returns -1 if successful, REG_ESPACE if ran out of space. */
2376
2377 static int
2378 set_image_of_range (work_area, start, end, translate)
2379 RE_TRANSLATE_TYPE translate;
2380 struct range_table_work_area *work_area;
2381 re_wchar_t start, end;
2382 {
2383 re_wchar_t cmin, cmax;
2384
2385 #ifdef emacs
2386 /* For Latin-1 ranges, use set_image_of_range_1
2387 to get proper handling of ranges that include letters and nonletters.
2388 For a range that includes the whole of Latin-1, this is not necessary.
2389 For other character sets, we don't bother to get this right. */
2390 if (RE_TRANSLATE_P (translate) && start < 04400
2391 && !(start < 04200 && end >= 04377))
2392 {
2393 int newend;
2394 int tem;
2395 newend = end;
2396 if (newend > 04377)
2397 newend = 04377;
2398 tem = set_image_of_range_1 (work_area, start, newend, translate);
2399 if (tem > 0)
2400 return tem;
2401
2402 start = 04400;
2403 if (end < 04400)
2404 return -1;
2405 }
2406 #endif
2407
2408 EXTEND_RANGE_TABLE (work_area, 2);
2409 work_area->table[work_area->used++] = (start);
2410 work_area->table[work_area->used++] = (end);
2411
2412 cmin = -1, cmax = -1;
2413
2414 if (RE_TRANSLATE_P (translate))
2415 {
2416 int ch;
2417
2418 for (ch = start; ch <= end; ch++)
2419 {
2420 re_wchar_t c = TRANSLATE (ch);
2421 if (! (start <= c && c <= end))
2422 {
2423 if (cmin == -1)
2424 cmin = c, cmax = c;
2425 else
2426 {
2427 cmin = MIN (cmin, c);
2428 cmax = MAX (cmax, c);
2429 }
2430 }
2431 }
2432
2433 if (cmin != -1)
2434 {
2435 EXTEND_RANGE_TABLE (work_area, 2);
2436 work_area->table[work_area->used++] = (cmin);
2437 work_area->table[work_area->used++] = (cmax);
2438 }
2439 }
2440
2441 return -1;
2442 }
2443 #endif /* 0 */
2444 \f
2445 #ifndef MATCH_MAY_ALLOCATE
2446
2447 /* If we cannot allocate large objects within re_match_2_internal,
2448 we make the fail stack and register vectors global.
2449 The fail stack, we grow to the maximum size when a regexp
2450 is compiled.
2451 The register vectors, we adjust in size each time we
2452 compile a regexp, according to the number of registers it needs. */
2453
2454 static fail_stack_type fail_stack;
2455
2456 /* Size with which the following vectors are currently allocated.
2457 That is so we can make them bigger as needed,
2458 but never make them smaller. */
2459 static int regs_allocated_size;
2460
2461 static re_char ** regstart, ** regend;
2462 static re_char **best_regstart, **best_regend;
2463
2464 /* Make the register vectors big enough for NUM_REGS registers,
2465 but don't make them smaller. */
2466
2467 static
2468 regex_grow_registers (num_regs)
2469 int num_regs;
2470 {
2471 if (num_regs > regs_allocated_size)
2472 {
2473 RETALLOC_IF (regstart, num_regs, re_char *);
2474 RETALLOC_IF (regend, num_regs, re_char *);
2475 RETALLOC_IF (best_regstart, num_regs, re_char *);
2476 RETALLOC_IF (best_regend, num_regs, re_char *);
2477
2478 regs_allocated_size = num_regs;
2479 }
2480 }
2481
2482 #endif /* not MATCH_MAY_ALLOCATE */
2483 \f
2484 static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
2485 compile_stack,
2486 regnum_t regnum));
2487
2488 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2489 Returns one of error codes defined in `regex.h', or zero for success.
2490
2491 Assumes the `allocated' (and perhaps `buffer') and `translate'
2492 fields are set in BUFP on entry.
2493
2494 If it succeeds, results are put in BUFP (if it returns an error, the
2495 contents of BUFP are undefined):
2496 `buffer' is the compiled pattern;
2497 `syntax' is set to SYNTAX;
2498 `used' is set to the length of the compiled pattern;
2499 `fastmap_accurate' is zero;
2500 `re_nsub' is the number of subexpressions in PATTERN;
2501 `not_bol' and `not_eol' are zero;
2502
2503 The `fastmap' field is neither examined nor set. */
2504
2505 /* Insert the `jump' from the end of last alternative to "here".
2506 The space for the jump has already been allocated. */
2507 #define FIXUP_ALT_JUMP() \
2508 do { \
2509 if (fixup_alt_jump) \
2510 STORE_JUMP (jump, fixup_alt_jump, b); \
2511 } while (0)
2512
2513
2514 /* Return, freeing storage we allocated. */
2515 #define FREE_STACK_RETURN(value) \
2516 do { \
2517 FREE_RANGE_TABLE_WORK_AREA (range_table_work); \
2518 free (compile_stack.stack); \
2519 return value; \
2520 } while (0)
2521
2522 static reg_errcode_t
2523 regex_compile (const re_char *pattern, size_t size, reg_syntax_t syntax, struct re_pattern_buffer *bufp)
2524 {
2525 /* We fetch characters from PATTERN here. */
2526 register re_wchar_t c, c1;
2527
2528 /* A random temporary spot in PATTERN. */
2529 re_char *p1;
2530
2531 /* Points to the end of the buffer, where we should append. */
2532 register unsigned char *b;
2533
2534 /* Keeps track of unclosed groups. */
2535 compile_stack_type compile_stack;
2536
2537 /* Points to the current (ending) position in the pattern. */
2538 #ifdef AIX
2539 /* `const' makes AIX compiler fail. */
2540 unsigned char *p = pattern;
2541 #else
2542 re_char *p = pattern;
2543 #endif
2544 re_char *pend = pattern + size;
2545
2546 /* How to translate the characters in the pattern. */
2547 RE_TRANSLATE_TYPE translate = bufp->translate;
2548
2549 /* Address of the count-byte of the most recently inserted `exactn'
2550 command. This makes it possible to tell if a new exact-match
2551 character can be added to that command or if the character requires
2552 a new `exactn' command. */
2553 unsigned char *pending_exact = 0;
2554
2555 /* Address of start of the most recently finished expression.
2556 This tells, e.g., postfix * where to find the start of its
2557 operand. Reset at the beginning of groups and alternatives. */
2558 unsigned char *laststart = 0;
2559
2560 /* Address of beginning of regexp, or inside of last group. */
2561 unsigned char *begalt;
2562
2563 /* Place in the uncompiled pattern (i.e., the {) to
2564 which to go back if the interval is invalid. */
2565 re_char *beg_interval;
2566
2567 /* Address of the place where a forward jump should go to the end of
2568 the containing expression. Each alternative of an `or' -- except the
2569 last -- ends with a forward jump of this sort. */
2570 unsigned char *fixup_alt_jump = 0;
2571
2572 /* Work area for range table of charset. */
2573 struct range_table_work_area range_table_work;
2574
2575 /* If the object matched can contain multibyte characters. */
2576 const boolean multibyte = RE_MULTIBYTE_P (bufp);
2577
2578 /* If a target of matching can contain multibyte characters. */
2579 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
2580
2581 /* Nonzero if we have pushed down into a subpattern. */
2582 int in_subpattern = 0;
2583
2584 /* These hold the values of p, pattern, and pend from the main
2585 pattern when we have pushed into a subpattern. */
2586 re_char *main_p;
2587 re_char *main_pattern;
2588 re_char *main_pend;
2589
2590 #ifdef DEBUG
2591 debug++;
2592 DEBUG_PRINT1 ("\nCompiling pattern: ");
2593 if (debug > 0)
2594 {
2595 unsigned debug_count;
2596
2597 for (debug_count = 0; debug_count < size; debug_count++)
2598 putchar (pattern[debug_count]);
2599 putchar ('\n');
2600 }
2601 #endif /* DEBUG */
2602
2603 /* Initialize the compile stack. */
2604 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2605 if (compile_stack.stack == NULL)
2606 return REG_ESPACE;
2607
2608 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2609 compile_stack.avail = 0;
2610
2611 range_table_work.table = 0;
2612 range_table_work.allocated = 0;
2613
2614 /* Initialize the pattern buffer. */
2615 bufp->syntax = syntax;
2616 bufp->fastmap_accurate = 0;
2617 bufp->not_bol = bufp->not_eol = 0;
2618 bufp->used_syntax = 0;
2619
2620 /* Set `used' to zero, so that if we return an error, the pattern
2621 printer (for debugging) will think there's no pattern. We reset it
2622 at the end. */
2623 bufp->used = 0;
2624
2625 /* Always count groups, whether or not bufp->no_sub is set. */
2626 bufp->re_nsub = 0;
2627
2628 #if !defined emacs && !defined SYNTAX_TABLE
2629 /* Initialize the syntax table. */
2630 init_syntax_once ();
2631 #endif
2632
2633 if (bufp->allocated == 0)
2634 {
2635 if (bufp->buffer)
2636 { /* If zero allocated, but buffer is non-null, try to realloc
2637 enough space. This loses if buffer's address is bogus, but
2638 that is the user's responsibility. */
2639 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
2640 }
2641 else
2642 { /* Caller did not allocate a buffer. Do it for them. */
2643 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
2644 }
2645 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);
2646
2647 bufp->allocated = INIT_BUF_SIZE;
2648 }
2649
2650 begalt = b = bufp->buffer;
2651
2652 /* Loop through the uncompiled pattern until we're at the end. */
2653 while (1)
2654 {
2655 if (p == pend)
2656 {
2657 /* If this is the end of an included regexp,
2658 pop back to the main regexp and try again. */
2659 if (in_subpattern)
2660 {
2661 in_subpattern = 0;
2662 pattern = main_pattern;
2663 p = main_p;
2664 pend = main_pend;
2665 continue;
2666 }
2667 /* If this is the end of the main regexp, we are done. */
2668 break;
2669 }
2670
2671 PATFETCH (c);
2672
2673 switch (c)
2674 {
2675 case ' ':
2676 {
2677 re_char *p1 = p;
2678
2679 /* If there's no special whitespace regexp, treat
2680 spaces normally. And don't try to do this recursively. */
2681 if (!whitespace_regexp || in_subpattern)
2682 goto normal_char;
2683
2684 /* Peek past following spaces. */
2685 while (p1 != pend)
2686 {
2687 if (*p1 != ' ')
2688 break;
2689 p1++;
2690 }
2691 /* If the spaces are followed by a repetition op,
2692 treat them normally. */
2693 if (p1 != pend
2694 && (*p1 == '*' || *p1 == '+' || *p1 == '?'
2695 || (*p1 == '\\' && p1 + 1 != pend && p1[1] == '{')))
2696 goto normal_char;
2697
2698 /* Replace the spaces with the whitespace regexp. */
2699 in_subpattern = 1;
2700 main_p = p1;
2701 main_pend = pend;
2702 main_pattern = pattern;
2703 p = pattern = whitespace_regexp;
2704 pend = p + strlen (p);
2705 break;
2706 }
2707
2708 case '^':
2709 {
2710 if ( /* If at start of pattern, it's an operator. */
2711 p == pattern + 1
2712 /* If context independent, it's an operator. */
2713 || syntax & RE_CONTEXT_INDEP_ANCHORS
2714 /* Otherwise, depends on what's come before. */
2715 || at_begline_loc_p (pattern, p, syntax))
2716 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);
2717 else
2718 goto normal_char;
2719 }
2720 break;
2721
2722
2723 case '$':
2724 {
2725 if ( /* If at end of pattern, it's an operator. */
2726 p == pend
2727 /* If context independent, it's an operator. */
2728 || syntax & RE_CONTEXT_INDEP_ANCHORS
2729 /* Otherwise, depends on what's next. */
2730 || at_endline_loc_p (p, pend, syntax))
2731 BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);
2732 else
2733 goto normal_char;
2734 }
2735 break;
2736
2737
2738 case '+':
2739 case '?':
2740 if ((syntax & RE_BK_PLUS_QM)
2741 || (syntax & RE_LIMITED_OPS))
2742 goto normal_char;
2743 handle_plus:
2744 case '*':
2745 /* If there is no previous pattern... */
2746 if (!laststart)
2747 {
2748 if (syntax & RE_CONTEXT_INVALID_OPS)
2749 FREE_STACK_RETURN (REG_BADRPT);
2750 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
2751 goto normal_char;
2752 }
2753
2754 {
2755 /* 1 means zero (many) matches is allowed. */
2756 boolean zero_times_ok = 0, many_times_ok = 0;
2757 boolean greedy = 1;
2758
2759 /* If there is a sequence of repetition chars, collapse it
2760 down to just one (the right one). We can't combine
2761 interval operators with these because of, e.g., `a{2}*',
2762 which should only match an even number of `a's. */
2763
2764 for (;;)
2765 {
2766 if ((syntax & RE_FRUGAL)
2767 && c == '?' && (zero_times_ok || many_times_ok))
2768 greedy = 0;
2769 else
2770 {
2771 zero_times_ok |= c != '+';
2772 many_times_ok |= c != '?';
2773 }
2774
2775 if (p == pend)
2776 break;
2777 else if (*p == '*'
2778 || (!(syntax & RE_BK_PLUS_QM)
2779 && (*p == '+' || *p == '?')))
2780 ;
2781 else if (syntax & RE_BK_PLUS_QM && *p == '\\')
2782 {
2783 if (p+1 == pend)
2784 FREE_STACK_RETURN (REG_EESCAPE);
2785 if (p[1] == '+' || p[1] == '?')
2786 PATFETCH (c); /* Gobble up the backslash. */
2787 else
2788 break;
2789 }
2790 else
2791 break;
2792 /* If we get here, we found another repeat character. */
2793 PATFETCH (c);
2794 }
2795
2796 /* Star, etc. applied to an empty pattern is equivalent
2797 to an empty pattern. */
2798 if (!laststart || laststart == b)
2799 break;
2800
2801 /* Now we know whether or not zero matches is allowed
2802 and also whether or not two or more matches is allowed. */
2803 if (greedy)
2804 {
2805 if (many_times_ok)
2806 {
2807 boolean simple = skip_one_char (laststart) == b;
2808 unsigned int startoffset = 0;
2809 re_opcode_t ofj =
2810 /* Check if the loop can match the empty string. */
2811 (simple || !analyse_first (laststart, b, NULL, 0))
2812 ? on_failure_jump : on_failure_jump_loop;
2813 assert (skip_one_char (laststart) <= b);
2814
2815 if (!zero_times_ok && simple)
2816 { /* Since simple * loops can be made faster by using
2817 on_failure_keep_string_jump, we turn simple P+
2818 into PP* if P is simple. */
2819 unsigned char *p1, *p2;
2820 startoffset = b - laststart;
2821 GET_BUFFER_SPACE (startoffset);
2822 p1 = b; p2 = laststart;
2823 while (p2 < p1)
2824 *b++ = *p2++;
2825 zero_times_ok = 1;
2826 }
2827
2828 GET_BUFFER_SPACE (6);
2829 if (!zero_times_ok)
2830 /* A + loop. */
2831 STORE_JUMP (ofj, b, b + 6);
2832 else
2833 /* Simple * loops can use on_failure_keep_string_jump
2834 depending on what follows. But since we don't know
2835 that yet, we leave the decision up to
2836 on_failure_jump_smart. */
2837 INSERT_JUMP (simple ? on_failure_jump_smart : ofj,
2838 laststart + startoffset, b + 6);
2839 b += 3;
2840 STORE_JUMP (jump, b, laststart + startoffset);
2841 b += 3;
2842 }
2843 else
2844 {
2845 /* A simple ? pattern. */
2846 assert (zero_times_ok);
2847 GET_BUFFER_SPACE (3);
2848 INSERT_JUMP (on_failure_jump, laststart, b + 3);
2849 b += 3;
2850 }
2851 }
2852 else /* not greedy */
2853 { /* I wish the greedy and non-greedy cases could be merged. */
2854
2855 GET_BUFFER_SPACE (7); /* We might use less. */
2856 if (many_times_ok)
2857 {
2858 boolean emptyp = analyse_first (laststart, b, NULL, 0);
2859
2860 /* The non-greedy multiple match looks like
2861 a repeat..until: we only need a conditional jump
2862 at the end of the loop. */
2863 if (emptyp) BUF_PUSH (no_op);
2864 STORE_JUMP (emptyp ? on_failure_jump_nastyloop
2865 : on_failure_jump, b, laststart);
2866 b += 3;
2867 if (zero_times_ok)
2868 {
2869 /* The repeat...until naturally matches one or more.
2870 To also match zero times, we need to first jump to
2871 the end of the loop (its conditional jump). */
2872 INSERT_JUMP (jump, laststart, b);
2873 b += 3;
2874 }
2875 }
2876 else
2877 {
2878 /* non-greedy a?? */
2879 INSERT_JUMP (jump, laststart, b + 3);
2880 b += 3;
2881 INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
2882 b += 3;
2883 }
2884 }
2885 }
2886 pending_exact = 0;
2887 break;
2888
2889
2890 case '.':
2891 laststart = b;
2892 BUF_PUSH (anychar);
2893 break;
2894
2895
2896 case '[':
2897 {
2898 CLEAR_RANGE_TABLE_WORK_USED (range_table_work);
2899
2900 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2901
2902 /* Ensure that we have enough space to push a charset: the
2903 opcode, the length count, and the bitset; 34 bytes in all. */
2904 GET_BUFFER_SPACE (34);
2905
2906 laststart = b;
2907
2908 /* We test `*p == '^' twice, instead of using an if
2909 statement, so we only need one BUF_PUSH. */
2910 BUF_PUSH (*p == '^' ? charset_not : charset);
2911 if (*p == '^')
2912 p++;
2913
2914 /* Remember the first position in the bracket expression. */
2915 p1 = p;
2916
2917 /* Push the number of bytes in the bitmap. */
2918 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
2919
2920 /* Clear the whole map. */
2921 memset (b, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2922
2923 /* charset_not matches newline according to a syntax bit. */
2924 if ((re_opcode_t) b[-2] == charset_not
2925 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2926 SET_LIST_BIT ('\n');
2927
2928 /* Read in characters and ranges, setting map bits. */
2929 for (;;)
2930 {
2931 boolean escaped_char = false;
2932 const unsigned char *p2 = p;
2933 re_wchar_t ch, c2;
2934
2935 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2936
2937 /* Don't translate yet. The range TRANSLATE(X..Y) cannot
2938 always be determined from TRANSLATE(X) and TRANSLATE(Y)
2939 So the translation is done later in a loop. Example:
2940 (let ((case-fold-search t)) (string-match "[A-_]" "A")) */
2941 PATFETCH (c);
2942
2943 /* \ might escape characters inside [...] and [^...]. */
2944 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
2945 {
2946 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
2947
2948 PATFETCH (c);
2949 escaped_char = true;
2950 }
2951 else
2952 {
2953 /* Could be the end of the bracket expression. If it's
2954 not (i.e., when the bracket expression is `[]' so
2955 far), the ']' character bit gets set way below. */
2956 if (c == ']' && p2 != p1)
2957 break;
2958 }
2959
2960 /* See if we're at the beginning of a possible character
2961 class. */
2962
2963 if (!escaped_char &&
2964 syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
2965 {
2966 /* Leave room for the null. */
2967 unsigned char str[CHAR_CLASS_MAX_LENGTH + 1];
2968 const unsigned char *class_beg;
2969
2970 PATFETCH (c);
2971 c1 = 0;
2972 class_beg = p;
2973
2974 /* If pattern is `[[:'. */
2975 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
2976
2977 for (;;)
2978 {
2979 PATFETCH (c);
2980 if ((c == ':' && *p == ']') || p == pend)
2981 break;
2982 if (c1 < CHAR_CLASS_MAX_LENGTH)
2983 str[c1++] = c;
2984 else
2985 /* This is in any case an invalid class name. */
2986 str[0] = '\0';
2987 }
2988 str[c1] = '\0';
2989
2990 /* If isn't a word bracketed by `[:' and `:]':
2991 undo the ending character, the letters, and
2992 leave the leading `:' and `[' (but set bits for
2993 them). */
2994 if (c == ':' && *p == ']')
2995 {
2996 re_wctype_t cc;
2997 int limit;
2998
2999 cc = re_wctype (str);
3000
3001 if (cc == 0)
3002 FREE_STACK_RETURN (REG_ECTYPE);
3003
3004 /* Throw away the ] at the end of the character
3005 class. */
3006 PATFETCH (c);
3007
3008 if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
3009
3010 #ifndef emacs
3011 for (ch = 0; ch < (1 << BYTEWIDTH); ++ch)
3012 if (re_iswctype (btowc (ch), cc))
3013 {
3014 c = TRANSLATE (ch);
3015 if (c < (1 << BYTEWIDTH))
3016 SET_LIST_BIT (c);
3017 }
3018 #else /* emacs */
3019 /* Most character classes in a multibyte match
3020 just set a flag. Exceptions are is_blank,
3021 is_digit, is_cntrl, and is_xdigit, since
3022 they can only match ASCII characters. We
3023 don't need to handle them for multibyte.
3024 They are distinguished by a negative wctype. */
3025
3026 /* Setup the gl_state object to its buffer-defined
3027 value. This hardcodes the buffer-global
3028 syntax-table for ASCII chars, while the other chars
3029 will obey syntax-table properties. It's not ideal,
3030 but it's the way it's been done until now. */
3031 SETUP_BUFFER_SYNTAX_TABLE ();
3032
3033 for (ch = 0; ch < 256; ++ch)
3034 {
3035 c = RE_CHAR_TO_MULTIBYTE (ch);
3036 if (! CHAR_BYTE8_P (c)
3037 && re_iswctype (c, cc))
3038 {
3039 SET_LIST_BIT (ch);
3040 c1 = TRANSLATE (c);
3041 if (c1 == c)
3042 continue;
3043 if (ASCII_CHAR_P (c1))
3044 SET_LIST_BIT (c1);
3045 else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0)
3046 SET_LIST_BIT (c1);
3047 }
3048 }
3049 SET_RANGE_TABLE_WORK_AREA_BIT
3050 (range_table_work, re_wctype_to_bit (cc));
3051 #endif /* emacs */
3052 /* In most cases the matching rule for char classes
3053 only uses the syntax table for multibyte chars,
3054 so that the content of the syntax-table it is not
3055 hardcoded in the range_table. SPACE and WORD are
3056 the two exceptions. */
3057 if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
3058 bufp->used_syntax = 1;
3059
3060 /* Repeat the loop. */
3061 continue;
3062 }
3063 else
3064 {
3065 /* Go back to right after the "[:". */
3066 p = class_beg;
3067 SET_LIST_BIT ('[');
3068
3069 /* Because the `:' may starts the range, we
3070 can't simply set bit and repeat the loop.
3071 Instead, just set it to C and handle below. */
3072 c = ':';
3073 }
3074 }
3075
3076 if (p < pend && p[0] == '-' && p[1] != ']')
3077 {
3078
3079 /* Discard the `-'. */
3080 PATFETCH (c1);
3081
3082 /* Fetch the character which ends the range. */
3083 PATFETCH (c1);
3084 #ifdef emacs
3085 if (CHAR_BYTE8_P (c1)
3086 && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c))
3087 /* Treat the range from a multibyte character to
3088 raw-byte character as empty. */
3089 c = c1 + 1;
3090 #endif /* emacs */
3091 }
3092 else
3093 /* Range from C to C. */
3094 c1 = c;
3095
3096 if (c > c1)
3097 {
3098 if (syntax & RE_NO_EMPTY_RANGES)
3099 FREE_STACK_RETURN (REG_ERANGEX);
3100 /* Else, repeat the loop. */
3101 }
3102 else
3103 {
3104 #ifndef emacs
3105 /* Set the range into bitmap */
3106 for (; c <= c1; c++)
3107 {
3108 ch = TRANSLATE (c);
3109 if (ch < (1 << BYTEWIDTH))
3110 SET_LIST_BIT (ch);
3111 }
3112 #else /* emacs */
3113 if (c < 128)
3114 {
3115 ch = MIN (127, c1);
3116 SETUP_ASCII_RANGE (range_table_work, c, ch);
3117 c = ch + 1;
3118 if (CHAR_BYTE8_P (c1))
3119 c = BYTE8_TO_CHAR (128);
3120 }
3121 if (c <= c1)
3122 {
3123 if (CHAR_BYTE8_P (c))
3124 {
3125 c = CHAR_TO_BYTE8 (c);
3126 c1 = CHAR_TO_BYTE8 (c1);
3127 for (; c <= c1; c++)
3128 SET_LIST_BIT (c);
3129 }
3130 else if (multibyte)
3131 {
3132 SETUP_MULTIBYTE_RANGE (range_table_work, c, c1);
3133 }
3134 else
3135 {
3136 SETUP_UNIBYTE_RANGE (range_table_work, c, c1);
3137 }
3138 }
3139 #endif /* emacs */
3140 }
3141 }
3142
3143 /* Discard any (non)matching list bytes that are all 0 at the
3144 end of the map. Decrease the map-length byte too. */
3145 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
3146 b[-1]--;
3147 b += b[-1];
3148
3149 /* Build real range table from work area. */
3150 if (RANGE_TABLE_WORK_USED (range_table_work)
3151 || RANGE_TABLE_WORK_BITS (range_table_work))
3152 {
3153 int i;
3154 int used = RANGE_TABLE_WORK_USED (range_table_work);
3155
3156 /* Allocate space for COUNT + RANGE_TABLE. Needs two
3157 bytes for flags, two for COUNT, and three bytes for
3158 each character. */
3159 GET_BUFFER_SPACE (4 + used * 3);
3160
3161 /* Indicate the existence of range table. */
3162 laststart[1] |= 0x80;
3163
3164 /* Store the character class flag bits into the range table.
3165 If not in emacs, these flag bits are always 0. */
3166 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) & 0xff;
3167 *b++ = RANGE_TABLE_WORK_BITS (range_table_work) >> 8;
3168
3169 STORE_NUMBER_AND_INCR (b, used / 2);
3170 for (i = 0; i < used; i++)
3171 STORE_CHARACTER_AND_INCR
3172 (b, RANGE_TABLE_WORK_ELT (range_table_work, i));
3173 }
3174 }
3175 break;
3176
3177
3178 case '(':
3179 if (syntax & RE_NO_BK_PARENS)
3180 goto handle_open;
3181 else
3182 goto normal_char;
3183
3184
3185 case ')':
3186 if (syntax & RE_NO_BK_PARENS)
3187 goto handle_close;
3188 else
3189 goto normal_char;
3190
3191
3192 case '\n':
3193 if (syntax & RE_NEWLINE_ALT)
3194 goto handle_alt;
3195 else
3196 goto normal_char;
3197
3198
3199 case '|':
3200 if (syntax & RE_NO_BK_VBAR)
3201 goto handle_alt;
3202 else
3203 goto normal_char;
3204
3205
3206 case '{':
3207 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
3208 goto handle_interval;
3209 else
3210 goto normal_char;
3211
3212
3213 case '\\':
3214 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
3215
3216 /* Do not translate the character after the \, so that we can
3217 distinguish, e.g., \B from \b, even if we normally would
3218 translate, e.g., B to b. */
3219 PATFETCH (c);
3220
3221 switch (c)
3222 {
3223 case '(':
3224 if (syntax & RE_NO_BK_PARENS)
3225 goto normal_backslash;
3226
3227 handle_open:
3228 {
3229 int shy = 0;
3230 regnum_t regnum = 0;
3231 if (p+1 < pend)
3232 {
3233 /* Look for a special (?...) construct */
3234 if ((syntax & RE_SHY_GROUPS) && *p == '?')
3235 {
3236 PATFETCH (c); /* Gobble up the '?'. */
3237 while (!shy)
3238 {
3239 PATFETCH (c);
3240 switch (c)
3241 {
3242 case ':': shy = 1; break;
3243 case '0':
3244 /* An explicitly specified regnum must start
3245 with non-0. */
3246 if (regnum == 0)
3247 FREE_STACK_RETURN (REG_BADPAT);
3248 case '1': case '2': case '3': case '4':
3249 case '5': case '6': case '7': case '8': case '9':
3250 regnum = 10*regnum + (c - '0'); break;
3251 default:
3252 /* Only (?:...) is supported right now. */
3253 FREE_STACK_RETURN (REG_BADPAT);
3254 }
3255 }
3256 }
3257 }
3258
3259 if (!shy)
3260 regnum = ++bufp->re_nsub;
3261 else if (regnum)
3262 { /* It's actually not shy, but explicitly numbered. */
3263 shy = 0;
3264 if (regnum > bufp->re_nsub)
3265 bufp->re_nsub = regnum;
3266 else if (regnum > bufp->re_nsub
3267 /* Ideally, we'd want to check that the specified
3268 group can't have matched (i.e. all subgroups
3269 using the same regnum are in other branches of
3270 OR patterns), but we don't currently keep track
3271 of enough info to do that easily. */
3272 || group_in_compile_stack (compile_stack, regnum))
3273 FREE_STACK_RETURN (REG_BADPAT);
3274 }
3275 else
3276 /* It's really shy. */
3277 regnum = - bufp->re_nsub;
3278
3279 if (COMPILE_STACK_FULL)
3280 {
3281 RETALLOC (compile_stack.stack, compile_stack.size << 1,
3282 compile_stack_elt_t);
3283 if (compile_stack.stack == NULL) return REG_ESPACE;
3284
3285 compile_stack.size <<= 1;
3286 }
3287
3288 /* These are the values to restore when we hit end of this
3289 group. They are all relative offsets, so that if the
3290 whole pattern moves because of realloc, they will still
3291 be valid. */
3292 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
3293 COMPILE_STACK_TOP.fixup_alt_jump
3294 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
3295 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
3296 COMPILE_STACK_TOP.regnum = regnum;
3297
3298 /* Do not push a start_memory for groups beyond the last one
3299 we can represent in the compiled pattern. */
3300 if (regnum <= MAX_REGNUM && regnum > 0)
3301 BUF_PUSH_2 (start_memory, regnum);
3302
3303 compile_stack.avail++;
3304
3305 fixup_alt_jump = 0;
3306 laststart = 0;
3307 begalt = b;
3308 /* If we've reached MAX_REGNUM groups, then this open
3309 won't actually generate any code, so we'll have to
3310 clear pending_exact explicitly. */
3311 pending_exact = 0;
3312 break;
3313 }
3314
3315 case ')':
3316 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
3317
3318 if (COMPILE_STACK_EMPTY)
3319 {
3320 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3321 goto normal_backslash;
3322 else
3323 FREE_STACK_RETURN (REG_ERPAREN);
3324 }
3325
3326 handle_close:
3327 FIXUP_ALT_JUMP ();
3328
3329 /* See similar code for backslashed left paren above. */
3330 if (COMPILE_STACK_EMPTY)
3331 {
3332 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
3333 goto normal_char;
3334 else
3335 FREE_STACK_RETURN (REG_ERPAREN);
3336 }
3337
3338 /* Since we just checked for an empty stack above, this
3339 ``can't happen''. */
3340 assert (compile_stack.avail != 0);
3341 {
3342 /* We don't just want to restore into `regnum', because
3343 later groups should continue to be numbered higher,
3344 as in `(ab)c(de)' -- the second group is #2. */
3345 regnum_t regnum;
3346
3347 compile_stack.avail--;
3348 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
3349 fixup_alt_jump
3350 = COMPILE_STACK_TOP.fixup_alt_jump
3351 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
3352 : 0;
3353 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
3354 regnum = COMPILE_STACK_TOP.regnum;
3355 /* If we've reached MAX_REGNUM groups, then this open
3356 won't actually generate any code, so we'll have to
3357 clear pending_exact explicitly. */
3358 pending_exact = 0;
3359
3360 /* We're at the end of the group, so now we know how many
3361 groups were inside this one. */
3362 if (regnum <= MAX_REGNUM && regnum > 0)
3363 BUF_PUSH_2 (stop_memory, regnum);
3364 }
3365 break;
3366
3367
3368 case '|': /* `\|'. */
3369 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
3370 goto normal_backslash;
3371 handle_alt:
3372 if (syntax & RE_LIMITED_OPS)
3373 goto normal_char;
3374
3375 /* Insert before the previous alternative a jump which
3376 jumps to this alternative if the former fails. */
3377 GET_BUFFER_SPACE (3);
3378 INSERT_JUMP (on_failure_jump, begalt, b + 6);
3379 pending_exact = 0;
3380 b += 3;
3381
3382 /* The alternative before this one has a jump after it
3383 which gets executed if it gets matched. Adjust that
3384 jump so it will jump to this alternative's analogous
3385 jump (put in below, which in turn will jump to the next
3386 (if any) alternative's such jump, etc.). The last such
3387 jump jumps to the correct final destination. A picture:
3388 _____ _____
3389 | | | |
3390 | v | v
3391 a | b | c
3392
3393 If we are at `b', then fixup_alt_jump right now points to a
3394 three-byte space after `a'. We'll put in the jump, set
3395 fixup_alt_jump to right after `b', and leave behind three
3396 bytes which we'll fill in when we get to after `c'. */
3397
3398 FIXUP_ALT_JUMP ();
3399
3400 /* Mark and leave space for a jump after this alternative,
3401 to be filled in later either by next alternative or
3402 when know we're at the end of a series of alternatives. */
3403 fixup_alt_jump = b;
3404 GET_BUFFER_SPACE (3);
3405 b += 3;
3406
3407 laststart = 0;
3408 begalt = b;
3409 break;
3410
3411
3412 case '{':
3413 /* If \{ is a literal. */
3414 if (!(syntax & RE_INTERVALS)
3415 /* If we're at `\{' and it's not the open-interval
3416 operator. */
3417 || (syntax & RE_NO_BK_BRACES))
3418 goto normal_backslash;
3419
3420 handle_interval:
3421 {
3422 /* If got here, then the syntax allows intervals. */
3423
3424 /* At least (most) this many matches must be made. */
3425 int lower_bound = 0, upper_bound = -1;
3426
3427 beg_interval = p;
3428
3429 GET_UNSIGNED_NUMBER (lower_bound);
3430
3431 if (c == ',')
3432 GET_UNSIGNED_NUMBER (upper_bound);
3433 else
3434 /* Interval such as `{1}' => match exactly once. */
3435 upper_bound = lower_bound;
3436
3437 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
3438 || (upper_bound >= 0 && lower_bound > upper_bound))
3439 FREE_STACK_RETURN (REG_BADBR);
3440
3441 if (!(syntax & RE_NO_BK_BRACES))
3442 {
3443 if (c != '\\')
3444 FREE_STACK_RETURN (REG_BADBR);
3445 if (p == pend)
3446 FREE_STACK_RETURN (REG_EESCAPE);
3447 PATFETCH (c);
3448 }
3449
3450 if (c != '}')
3451 FREE_STACK_RETURN (REG_BADBR);
3452
3453 /* We just parsed a valid interval. */
3454
3455 /* If it's invalid to have no preceding re. */
3456 if (!laststart)
3457 {
3458 if (syntax & RE_CONTEXT_INVALID_OPS)
3459 FREE_STACK_RETURN (REG_BADRPT);
3460 else if (syntax & RE_CONTEXT_INDEP_OPS)
3461 laststart = b;
3462 else
3463 goto unfetch_interval;
3464 }
3465
3466 if (upper_bound == 0)
3467 /* If the upper bound is zero, just drop the sub pattern
3468 altogether. */
3469 b = laststart;
3470 else if (lower_bound == 1 && upper_bound == 1)
3471 /* Just match it once: nothing to do here. */
3472 ;
3473
3474 /* Otherwise, we have a nontrivial interval. When
3475 we're all done, the pattern will look like:
3476 set_number_at <jump count> <upper bound>
3477 set_number_at <succeed_n count> <lower bound>
3478 succeed_n <after jump addr> <succeed_n count>
3479 <body of loop>
3480 jump_n <succeed_n addr> <jump count>
3481 (The upper bound and `jump_n' are omitted if
3482 `upper_bound' is 1, though.) */
3483 else
3484 { /* If the upper bound is > 1, we need to insert
3485 more at the end of the loop. */
3486 unsigned int nbytes = (upper_bound < 0 ? 3
3487 : upper_bound > 1 ? 5 : 0);
3488 unsigned int startoffset = 0;
3489
3490 GET_BUFFER_SPACE (20); /* We might use less. */
3491
3492 if (lower_bound == 0)
3493 {
3494 /* A succeed_n that starts with 0 is really a
3495 a simple on_failure_jump_loop. */
3496 INSERT_JUMP (on_failure_jump_loop, laststart,
3497 b + 3 + nbytes);
3498 b += 3;
3499 }
3500 else
3501 {
3502 /* Initialize lower bound of the `succeed_n', even
3503 though it will be set during matching by its
3504 attendant `set_number_at' (inserted next),
3505 because `re_compile_fastmap' needs to know.
3506 Jump to the `jump_n' we might insert below. */
3507 INSERT_JUMP2 (succeed_n, laststart,
3508 b + 5 + nbytes,
3509 lower_bound);
3510 b += 5;
3511
3512 /* Code to initialize the lower bound. Insert
3513 before the `succeed_n'. The `5' is the last two
3514 bytes of this `set_number_at', plus 3 bytes of
3515 the following `succeed_n'. */
3516 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
3517 b += 5;
3518 startoffset += 5;
3519 }
3520
3521 if (upper_bound < 0)
3522 {
3523 /* A negative upper bound stands for infinity,
3524 in which case it degenerates to a plain jump. */
3525 STORE_JUMP (jump, b, laststart + startoffset);
3526 b += 3;
3527 }
3528 else if (upper_bound > 1)
3529 { /* More than one repetition is allowed, so
3530 append a backward jump to the `succeed_n'
3531 that starts this interval.
3532
3533 When we've reached this during matching,
3534 we'll have matched the interval once, so
3535 jump back only `upper_bound - 1' times. */
3536 STORE_JUMP2 (jump_n, b, laststart + startoffset,
3537 upper_bound - 1);
3538 b += 5;
3539
3540 /* The location we want to set is the second
3541 parameter of the `jump_n'; that is `b-2' as
3542 an absolute address. `laststart' will be
3543 the `set_number_at' we're about to insert;
3544 `laststart+3' the number to set, the source
3545 for the relative address. But we are
3546 inserting into the middle of the pattern --
3547 so everything is getting moved up by 5.
3548 Conclusion: (b - 2) - (laststart + 3) + 5,
3549 i.e., b - laststart.
3550
3551 We insert this at the beginning of the loop
3552 so that if we fail during matching, we'll
3553 reinitialize the bounds. */
3554 insert_op2 (set_number_at, laststart, b - laststart,
3555 upper_bound - 1, b);
3556 b += 5;
3557 }
3558 }
3559 pending_exact = 0;
3560 beg_interval = NULL;
3561 }
3562 break;
3563
3564 unfetch_interval:
3565 /* If an invalid interval, match the characters as literals. */
3566 assert (beg_interval);
3567 p = beg_interval;
3568 beg_interval = NULL;
3569
3570 /* normal_char and normal_backslash need `c'. */
3571 c = '{';
3572
3573 if (!(syntax & RE_NO_BK_BRACES))
3574 {
3575 assert (p > pattern && p[-1] == '\\');
3576 goto normal_backslash;
3577 }
3578 else
3579 goto normal_char;
3580
3581 #ifdef emacs
3582 /* There is no way to specify the before_dot and after_dot
3583 operators. rms says this is ok. --karl */
3584 case '=':
3585 BUF_PUSH (at_dot);
3586 break;
3587
3588 case 's':
3589 laststart = b;
3590 PATFETCH (c);
3591 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
3592 break;
3593
3594 case 'S':
3595 laststart = b;
3596 PATFETCH (c);
3597 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
3598 break;
3599
3600 case 'c':
3601 laststart = b;
3602 PATFETCH (c);
3603 BUF_PUSH_2 (categoryspec, c);
3604 break;
3605
3606 case 'C':
3607 laststart = b;
3608 PATFETCH (c);
3609 BUF_PUSH_2 (notcategoryspec, c);
3610 break;
3611 #endif /* emacs */
3612
3613
3614 case 'w':
3615 if (syntax & RE_NO_GNU_OPS)
3616 goto normal_char;
3617 laststart = b;
3618 BUF_PUSH_2 (syntaxspec, Sword);
3619 break;
3620
3621
3622 case 'W':
3623 if (syntax & RE_NO_GNU_OPS)
3624 goto normal_char;
3625 laststart = b;
3626 BUF_PUSH_2 (notsyntaxspec, Sword);
3627 break;
3628
3629
3630 case '<':
3631 if (syntax & RE_NO_GNU_OPS)
3632 goto normal_char;
3633 BUF_PUSH (wordbeg);
3634 break;
3635
3636 case '>':
3637 if (syntax & RE_NO_GNU_OPS)
3638 goto normal_char;
3639 BUF_PUSH (wordend);
3640 break;
3641
3642 case '_':
3643 if (syntax & RE_NO_GNU_OPS)
3644 goto normal_char;
3645 laststart = b;
3646 PATFETCH (c);
3647 if (c == '<')
3648 BUF_PUSH (symbeg);
3649 else if (c == '>')
3650 BUF_PUSH (symend);
3651 else
3652 FREE_STACK_RETURN (REG_BADPAT);
3653 break;
3654
3655 case 'b':
3656 if (syntax & RE_NO_GNU_OPS)
3657 goto normal_char;
3658 BUF_PUSH (wordbound);
3659 break;
3660
3661 case 'B':
3662 if (syntax & RE_NO_GNU_OPS)
3663 goto normal_char;
3664 BUF_PUSH (notwordbound);
3665 break;
3666
3667 case '`':
3668 if (syntax & RE_NO_GNU_OPS)
3669 goto normal_char;
3670 BUF_PUSH (begbuf);
3671 break;
3672
3673 case '\'':
3674 if (syntax & RE_NO_GNU_OPS)
3675 goto normal_char;
3676 BUF_PUSH (endbuf);
3677 break;
3678
3679 case '1': case '2': case '3': case '4': case '5':
3680 case '6': case '7': case '8': case '9':
3681 {
3682 regnum_t reg;
3683
3684 if (syntax & RE_NO_BK_REFS)
3685 goto normal_backslash;
3686
3687 reg = c - '0';
3688
3689 if (reg > bufp->re_nsub || reg < 1
3690 /* Can't back reference to a subexp before its end. */
3691 || group_in_compile_stack (compile_stack, reg))
3692 FREE_STACK_RETURN (REG_ESUBREG);
3693
3694 laststart = b;
3695 BUF_PUSH_2 (duplicate, reg);
3696 }
3697 break;
3698
3699
3700 case '+':
3701 case '?':
3702 if (syntax & RE_BK_PLUS_QM)
3703 goto handle_plus;
3704 else
3705 goto normal_backslash;
3706
3707 default:
3708 normal_backslash:
3709 /* You might think it would be useful for \ to mean
3710 not to translate; but if we don't translate it
3711 it will never match anything. */
3712 goto normal_char;
3713 }
3714 break;
3715
3716
3717 default:
3718 /* Expects the character in `c'. */
3719 normal_char:
3720 /* If no exactn currently being built. */
3721 if (!pending_exact
3722
3723 /* If last exactn not at current position. */
3724 || pending_exact + *pending_exact + 1 != b
3725
3726 /* We have only one byte following the exactn for the count. */
3727 || *pending_exact >= (1 << BYTEWIDTH) - MAX_MULTIBYTE_LENGTH
3728
3729 /* If followed by a repetition operator. */
3730 || (p != pend && (*p == '*' || *p == '^'))
3731 || ((syntax & RE_BK_PLUS_QM)
3732 ? p + 1 < pend && *p == '\\' && (p[1] == '+' || p[1] == '?')
3733 : p != pend && (*p == '+' || *p == '?'))
3734 || ((syntax & RE_INTERVALS)
3735 && ((syntax & RE_NO_BK_BRACES)
3736 ? p != pend && *p == '{'
3737 : p + 1 < pend && p[0] == '\\' && p[1] == '{')))
3738 {
3739 /* Start building a new exactn. */
3740
3741 laststart = b;
3742
3743 BUF_PUSH_2 (exactn, 0);
3744 pending_exact = b - 1;
3745 }
3746
3747 GET_BUFFER_SPACE (MAX_MULTIBYTE_LENGTH);
3748 {
3749 int len;
3750
3751 if (multibyte)
3752 {
3753 c = TRANSLATE (c);
3754 len = CHAR_STRING (c, b);
3755 b += len;
3756 }
3757 else
3758 {
3759 c1 = RE_CHAR_TO_MULTIBYTE (c);
3760 if (! CHAR_BYTE8_P (c1))
3761 {
3762 re_wchar_t c2 = TRANSLATE (c1);
3763
3764 if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0)
3765 c = c1;
3766 }
3767 *b++ = c;
3768 len = 1;
3769 }
3770 (*pending_exact) += len;
3771 }
3772
3773 break;
3774 } /* switch (c) */
3775 } /* while p != pend */
3776
3777
3778 /* Through the pattern now. */
3779
3780 FIXUP_ALT_JUMP ();
3781
3782 if (!COMPILE_STACK_EMPTY)
3783 FREE_STACK_RETURN (REG_EPAREN);
3784
3785 /* If we don't want backtracking, force success
3786 the first time we reach the end of the compiled pattern. */
3787 if (syntax & RE_NO_POSIX_BACKTRACKING)
3788 BUF_PUSH (succeed);
3789
3790 /* We have succeeded; set the length of the buffer. */
3791 bufp->used = b - bufp->buffer;
3792
3793 #ifdef DEBUG
3794 if (debug > 0)
3795 {
3796 re_compile_fastmap (bufp);
3797 DEBUG_PRINT1 ("\nCompiled pattern: \n");
3798 print_compiled_pattern (bufp);
3799 }
3800 debug--;
3801 #endif /* DEBUG */
3802
3803 #ifndef MATCH_MAY_ALLOCATE
3804 /* Initialize the failure stack to the largest possible stack. This
3805 isn't necessary unless we're trying to avoid calling alloca in
3806 the search and match routines. */
3807 {
3808 int num_regs = bufp->re_nsub + 1;
3809
3810 if (fail_stack.size < re_max_failures * TYPICAL_FAILURE_SIZE)
3811 {
3812 fail_stack.size = re_max_failures * TYPICAL_FAILURE_SIZE;
3813
3814 if (! fail_stack.stack)
3815 fail_stack.stack
3816 = (fail_stack_elt_t *) malloc (fail_stack.size
3817 * sizeof (fail_stack_elt_t));
3818 else
3819 fail_stack.stack
3820 = (fail_stack_elt_t *) realloc (fail_stack.stack,
3821 (fail_stack.size
3822 * sizeof (fail_stack_elt_t)));
3823 }
3824
3825 regex_grow_registers (num_regs);
3826 }
3827 #endif /* not MATCH_MAY_ALLOCATE */
3828
3829 FREE_STACK_RETURN (REG_NOERROR);
3830 } /* regex_compile */
3831 \f
3832 /* Subroutines for `regex_compile'. */
3833
3834 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3835
3836 static void
3837 store_op1 (re_opcode_t op, unsigned char *loc, int arg)
3838 {
3839 *loc = (unsigned char) op;
3840 STORE_NUMBER (loc + 1, arg);
3841 }
3842
3843
3844 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3845
3846 static void
3847 store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3848 {
3849 *loc = (unsigned char) op;
3850 STORE_NUMBER (loc + 1, arg1);
3851 STORE_NUMBER (loc + 3, arg2);
3852 }
3853
3854
3855 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3856 for OP followed by two-byte integer parameter ARG. */
3857
3858 static void
3859 insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3860 {
3861 register unsigned char *pfrom = end;
3862 register unsigned char *pto = end + 3;
3863
3864 while (pfrom != loc)
3865 *--pto = *--pfrom;
3866
3867 store_op1 (op, loc, arg);
3868 }
3869
3870
3871 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3872
3873 static void
3874 insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end)
3875 {
3876 register unsigned char *pfrom = end;
3877 register unsigned char *pto = end + 5;
3878
3879 while (pfrom != loc)
3880 *--pto = *--pfrom;
3881
3882 store_op2 (op, loc, arg1, arg2);
3883 }
3884
3885
3886 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3887 after an alternative or a begin-subexpression. We assume there is at
3888 least one character before the ^. */
3889
3890 static boolean
3891 at_begline_loc_p (const re_char *pattern, const re_char *p, reg_syntax_t syntax)
3892 {
3893 re_char *prev = p - 2;
3894 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3895
3896 return
3897 /* After a subexpression? */
3898 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3899 /* After an alternative? */
3900 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
3901 /* After a shy subexpression? */
3902 || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
3903 && prev[-1] == '?' && prev[-2] == '('
3904 && (syntax & RE_NO_BK_PARENS
3905 || (prev - 3 >= pattern && prev[-3] == '\\')));
3906 }
3907
3908
3909 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3910 at least one character after the $, i.e., `P < PEND'. */
3911
3912 static boolean
3913 at_endline_loc_p (const re_char *p, const re_char *pend, reg_syntax_t syntax)
3914 {
3915 re_char *next = p;
3916 boolean next_backslash = *next == '\\';
3917 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3918
3919 return
3920 /* Before a subexpression? */
3921 (syntax & RE_NO_BK_PARENS ? *next == ')'
3922 : next_backslash && next_next && *next_next == ')')
3923 /* Before an alternative? */
3924 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3925 : next_backslash && next_next && *next_next == '|');
3926 }
3927
3928
3929 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3930 false if it's not. */
3931
3932 static boolean
3933 group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum)
3934 {
3935 int this_element;
3936
3937 for (this_element = compile_stack.avail - 1;
3938 this_element >= 0;
3939 this_element--)
3940 if (compile_stack.stack[this_element].regnum == regnum)
3941 return true;
3942
3943 return false;
3944 }
3945 \f
3946 /* analyse_first.
3947 If fastmap is non-NULL, go through the pattern and fill fastmap
3948 with all the possible leading chars. If fastmap is NULL, don't
3949 bother filling it up (obviously) and only return whether the
3950 pattern could potentially match the empty string.
3951
3952 Return 1 if p..pend might match the empty string.
3953 Return 0 if p..pend matches at least one char.
3954 Return -1 if fastmap was not updated accurately. */
3955
3956 static int
3957 analyse_first (const re_char *p, const re_char *pend, char *fastmap, const int multibyte)
3958 {
3959 int j, k;
3960 boolean not;
3961
3962 /* If all elements for base leading-codes in fastmap is set, this
3963 flag is set true. */
3964 boolean match_any_multibyte_characters = false;
3965
3966 assert (p);
3967
3968 /* The loop below works as follows:
3969 - It has a working-list kept in the PATTERN_STACK and which basically
3970 starts by only containing a pointer to the first operation.
3971 - If the opcode we're looking at is a match against some set of
3972 chars, then we add those chars to the fastmap and go on to the
3973 next work element from the worklist (done via `break').
3974 - If the opcode is a control operator on the other hand, we either
3975 ignore it (if it's meaningless at this point, such as `start_memory')
3976 or execute it (if it's a jump). If the jump has several destinations
3977 (i.e. `on_failure_jump'), then we push the other destination onto the
3978 worklist.
3979 We guarantee termination by ignoring backward jumps (more or less),
3980 so that `p' is monotonically increasing. More to the point, we
3981 never set `p' (or push) anything `<= p1'. */
3982
3983 while (p < pend)
3984 {
3985 /* `p1' is used as a marker of how far back a `on_failure_jump'
3986 can go without being ignored. It is normally equal to `p'
3987 (which prevents any backward `on_failure_jump') except right
3988 after a plain `jump', to allow patterns such as:
3989 0: jump 10
3990 3..9: <body>
3991 10: on_failure_jump 3
3992 as used for the *? operator. */
3993 re_char *p1 = p;
3994
3995 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
3996 {
3997 case succeed:
3998 return 1;
3999
4000 case duplicate:
4001 /* If the first character has to match a backreference, that means
4002 that the group was empty (since it already matched). Since this
4003 is the only case that interests us here, we can assume that the
4004 backreference must match the empty string. */
4005 p++;
4006 continue;
4007
4008
4009 /* Following are the cases which match a character. These end
4010 with `break'. */
4011
4012 case exactn:
4013 if (fastmap)
4014 {
4015 /* If multibyte is nonzero, the first byte of each
4016 character is an ASCII or a leading code. Otherwise,
4017 each byte is a character. Thus, this works in both
4018 cases. */
4019 fastmap[p[1]] = 1;
4020 if (! multibyte)
4021 {
4022 /* For the case of matching this unibyte regex
4023 against multibyte, we must set a leading code of
4024 the corresponding multibyte character. */
4025 int c = RE_CHAR_TO_MULTIBYTE (p[1]);
4026
4027 fastmap[CHAR_LEADING_CODE (c)] = 1;
4028 }
4029 }
4030 break;
4031
4032
4033 case anychar:
4034 /* We could put all the chars except for \n (and maybe \0)
4035 but we don't bother since it is generally not worth it. */
4036 if (!fastmap) break;
4037 return -1;
4038
4039
4040 case charset_not:
4041 if (!fastmap) break;
4042 {
4043 /* Chars beyond end of bitmap are possible matches. */
4044 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
4045 j < (1 << BYTEWIDTH); j++)
4046 fastmap[j] = 1;
4047 }
4048
4049 /* Fallthrough */
4050 case charset:
4051 if (!fastmap) break;
4052 not = (re_opcode_t) *(p - 1) == charset_not;
4053 for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
4054 j >= 0; j--)
4055 if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
4056 fastmap[j] = 1;
4057
4058 #ifdef emacs
4059 if (/* Any leading code can possibly start a character
4060 which doesn't match the specified set of characters. */
4061 not
4062 ||
4063 /* If we can match a character class, we can match any
4064 multibyte characters. */
4065 (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4066 && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0))
4067
4068 {
4069 if (match_any_multibyte_characters == false)
4070 {
4071 for (j = MIN_MULTIBYTE_LEADING_CODE;
4072 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4073 fastmap[j] = 1;
4074 match_any_multibyte_characters = true;
4075 }
4076 }
4077
4078 else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2])
4079 && match_any_multibyte_characters == false)
4080 {
4081 /* Set fastmap[I] to 1 where I is a leading code of each
4082 multibyte character in the range table. */
4083 int c, count;
4084 unsigned char lc1, lc2;
4085
4086 /* Make P points the range table. `+ 2' is to skip flag
4087 bits for a character class. */
4088 p += CHARSET_BITMAP_SIZE (&p[-2]) + 2;
4089
4090 /* Extract the number of ranges in range table into COUNT. */
4091 EXTRACT_NUMBER_AND_INCR (count, p);
4092 for (; count > 0; count--, p += 3)
4093 {
4094 /* Extract the start and end of each range. */
4095 EXTRACT_CHARACTER (c, p);
4096 lc1 = CHAR_LEADING_CODE (c);
4097 p += 3;
4098 EXTRACT_CHARACTER (c, p);
4099 lc2 = CHAR_LEADING_CODE (c);
4100 for (j = lc1; j <= lc2; j++)
4101 fastmap[j] = 1;
4102 }
4103 }
4104 #endif
4105 break;
4106
4107 case syntaxspec:
4108 case notsyntaxspec:
4109 if (!fastmap) break;
4110 #ifndef emacs
4111 not = (re_opcode_t)p[-1] == notsyntaxspec;
4112 k = *p++;
4113 for (j = 0; j < (1 << BYTEWIDTH); j++)
4114 if ((SYNTAX (j) == (enum syntaxcode) k) ^ not)
4115 fastmap[j] = 1;
4116 break;
4117 #else /* emacs */
4118 /* This match depends on text properties. These end with
4119 aborting optimizations. */
4120 return -1;
4121
4122 case categoryspec:
4123 case notcategoryspec:
4124 if (!fastmap) break;
4125 not = (re_opcode_t)p[-1] == notcategoryspec;
4126 k = *p++;
4127 for (j = (1 << BYTEWIDTH); j >= 0; j--)
4128 if ((CHAR_HAS_CATEGORY (j, k)) ^ not)
4129 fastmap[j] = 1;
4130
4131 /* Any leading code can possibly start a character which
4132 has or doesn't has the specified category. */
4133 if (match_any_multibyte_characters == false)
4134 {
4135 for (j = MIN_MULTIBYTE_LEADING_CODE;
4136 j <= MAX_MULTIBYTE_LEADING_CODE; j++)
4137 fastmap[j] = 1;
4138 match_any_multibyte_characters = true;
4139 }
4140 break;
4141
4142 /* All cases after this match the empty string. These end with
4143 `continue'. */
4144
4145 case before_dot:
4146 case at_dot:
4147 case after_dot:
4148 #endif /* !emacs */
4149 case no_op:
4150 case begline:
4151 case endline:
4152 case begbuf:
4153 case endbuf:
4154 case wordbound:
4155 case notwordbound:
4156 case wordbeg:
4157 case wordend:
4158 case symbeg:
4159 case symend:
4160 continue;
4161
4162
4163 case jump:
4164 EXTRACT_NUMBER_AND_INCR (j, p);
4165 if (j < 0)
4166 /* Backward jumps can only go back to code that we've already
4167 visited. `re_compile' should make sure this is true. */
4168 break;
4169 p += j;
4170 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4171 {
4172 case on_failure_jump:
4173 case on_failure_keep_string_jump:
4174 case on_failure_jump_loop:
4175 case on_failure_jump_nastyloop:
4176 case on_failure_jump_smart:
4177 p++;
4178 break;
4179 default:
4180 continue;
4181 };
4182 /* Keep `p1' to allow the `on_failure_jump' we are jumping to
4183 to jump back to "just after here". */
4184 /* Fallthrough */
4185
4186 case on_failure_jump:
4187 case on_failure_keep_string_jump:
4188 case on_failure_jump_nastyloop:
4189 case on_failure_jump_loop:
4190 case on_failure_jump_smart:
4191 EXTRACT_NUMBER_AND_INCR (j, p);
4192 if (p + j <= p1)
4193 ; /* Backward jump to be ignored. */
4194 else
4195 { /* We have to look down both arms.
4196 We first go down the "straight" path so as to minimize
4197 stack usage when going through alternatives. */
4198 int r = analyse_first (p, pend, fastmap, multibyte);
4199 if (r) return r;
4200 p += j;
4201 }
4202 continue;
4203
4204
4205 case jump_n:
4206 /* This code simply does not properly handle forward jump_n. */
4207 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p); assert (j < 0));
4208 p += 4;
4209 /* jump_n can either jump or fall through. The (backward) jump
4210 case has already been handled, so we only need to look at the
4211 fallthrough case. */
4212 continue;
4213
4214 case succeed_n:
4215 /* If N == 0, it should be an on_failure_jump_loop instead. */
4216 DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); assert (j > 0));
4217 p += 4;
4218 /* We only care about one iteration of the loop, so we don't
4219 need to consider the case where this behaves like an
4220 on_failure_jump. */
4221 continue;
4222
4223
4224 case set_number_at:
4225 p += 4;
4226 continue;
4227
4228
4229 case start_memory:
4230 case stop_memory:
4231 p += 1;
4232 continue;
4233
4234
4235 default:
4236 abort (); /* We have listed all the cases. */
4237 } /* switch *p++ */
4238
4239 /* Getting here means we have found the possible starting
4240 characters for one path of the pattern -- and that the empty
4241 string does not match. We need not follow this path further. */
4242 return 0;
4243 } /* while p */
4244
4245 /* We reached the end without matching anything. */
4246 return 1;
4247
4248 } /* analyse_first */
4249 \f
4250 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
4251 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
4252 characters can start a string that matches the pattern. This fastmap
4253 is used by re_search to skip quickly over impossible starting points.
4254
4255 Character codes above (1 << BYTEWIDTH) are not represented in the
4256 fastmap, but the leading codes are represented. Thus, the fastmap
4257 indicates which character sets could start a match.
4258
4259 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
4260 area as BUFP->fastmap.
4261
4262 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
4263 the pattern buffer.
4264
4265 Returns 0 if we succeed, -2 if an internal error. */
4266
4267 int
4268 re_compile_fastmap (struct re_pattern_buffer *bufp)
4269 {
4270 char *fastmap = bufp->fastmap;
4271 int analysis;
4272
4273 assert (fastmap && bufp->buffer);
4274
4275 memset (fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
4276 bufp->fastmap_accurate = 1; /* It will be when we're done. */
4277
4278 analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,
4279 fastmap, RE_MULTIBYTE_P (bufp));
4280 bufp->can_be_null = (analysis != 0);
4281 return 0;
4282 } /* re_compile_fastmap */
4283 \f
4284 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4285 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4286 this memory for recording register information. STARTS and ENDS
4287 must be allocated using the malloc library routine, and must each
4288 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4289
4290 If NUM_REGS == 0, then subsequent matches should allocate their own
4291 register data.
4292
4293 Unless this function is called, the first search or match using
4294 PATTERN_BUFFER will allocate its own register data, without
4295 freeing the old data. */
4296
4297 void
4298 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs, unsigned int num_regs, regoff_t *starts, regoff_t *ends)
4299 {
4300 if (num_regs)
4301 {
4302 bufp->regs_allocated = REGS_REALLOCATE;
4303 regs->num_regs = num_regs;
4304 regs->start = starts;
4305 regs->end = ends;
4306 }
4307 else
4308 {
4309 bufp->regs_allocated = REGS_UNALLOCATED;
4310 regs->num_regs = 0;
4311 regs->start = regs->end = (regoff_t *) 0;
4312 }
4313 }
4314 WEAK_ALIAS (__re_set_registers, re_set_registers)
4315 \f
4316 /* Searching routines. */
4317
4318 /* Like re_search_2, below, but only one string is specified, and
4319 doesn't let you say where to stop matching. */
4320
4321 int
4322 re_search (struct re_pattern_buffer *bufp, const char *string, int size, int startpos, int range, struct re_registers *regs)
4323 {
4324 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
4325 regs, size);
4326 }
4327 WEAK_ALIAS (__re_search, re_search)
4328
4329 /* Head address of virtual concatenation of string. */
4330 #define HEAD_ADDR_VSTRING(P) \
4331 (((P) >= size1 ? string2 : string1))
4332
4333 /* End address of virtual concatenation of string. */
4334 #define STOP_ADDR_VSTRING(P) \
4335 (((P) >= size1 ? string2 + size2 : string1 + size1))
4336
4337 /* Address of POS in the concatenation of virtual string. */
4338 #define POS_ADDR_VSTRING(POS) \
4339 (((POS) >= size1 ? string2 - size1 : string1) + (POS))
4340
4341 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4342 virtual concatenation of STRING1 and STRING2, starting first at index
4343 STARTPOS, then at STARTPOS + 1, and so on.
4344
4345 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4346
4347 RANGE is how far to scan while trying to match. RANGE = 0 means try
4348 only at STARTPOS; in general, the last start tried is STARTPOS +
4349 RANGE.
4350
4351 In REGS, return the indices of the virtual concatenation of STRING1
4352 and STRING2 that matched the entire BUFP->buffer and its contained
4353 subexpressions.
4354
4355 Do not consider matching one past the index STOP in the virtual
4356 concatenation of STRING1 and STRING2.
4357
4358 We return either the position in the strings at which the match was
4359 found, -1 if no match, or -2 if error (such as failure
4360 stack overflow). */
4361
4362 int
4363 re_search_2 (struct re_pattern_buffer *bufp, const char *str1, int size1, const char *str2, int size2, int startpos, int range, struct re_registers *regs, int stop)
4364 {
4365 int val;
4366 re_char *string1 = (re_char*) str1;
4367 re_char *string2 = (re_char*) str2;
4368 register char *fastmap = bufp->fastmap;
4369 register RE_TRANSLATE_TYPE translate = bufp->translate;
4370 int total_size = size1 + size2;
4371 int endpos = startpos + range;
4372 boolean anchored_start;
4373 /* Nonzero if we are searching multibyte string. */
4374 const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp);
4375
4376 /* Check for out-of-range STARTPOS. */
4377 if (startpos < 0 || startpos > total_size)
4378 return -1;
4379
4380 /* Fix up RANGE if it might eventually take us outside
4381 the virtual concatenation of STRING1 and STRING2.
4382 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */
4383 if (endpos < 0)
4384 range = 0 - startpos;
4385 else if (endpos > total_size)
4386 range = total_size - startpos;
4387
4388 /* If the search isn't to be a backwards one, don't waste time in a
4389 search for a pattern anchored at beginning of buffer. */
4390 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
4391 {
4392 if (startpos > 0)
4393 return -1;
4394 else
4395 range = 0;
4396 }
4397
4398 #ifdef emacs
4399 /* In a forward search for something that starts with \=.
4400 don't keep searching past point. */
4401 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0)
4402 {
4403 range = PT_BYTE - BEGV_BYTE - startpos;
4404 if (range < 0)
4405 return -1;
4406 }
4407 #endif /* emacs */
4408
4409 /* Update the fastmap now if not correct already. */
4410 if (fastmap && !bufp->fastmap_accurate)
4411 re_compile_fastmap (bufp);
4412
4413 /* See whether the pattern is anchored. */
4414 anchored_start = (bufp->buffer[0] == begline);
4415
4416 #ifdef emacs
4417 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
4418 {
4419 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (startpos));
4420
4421 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
4422 }
4423 #endif
4424
4425 /* Loop through the string, looking for a place to start matching. */
4426 for (;;)
4427 {
4428 /* If the pattern is anchored,
4429 skip quickly past places we cannot match.
4430 We don't bother to treat startpos == 0 specially
4431 because that case doesn't repeat. */
4432 if (anchored_start && startpos > 0)
4433 {
4434 if (! ((startpos <= size1 ? string1[startpos - 1]
4435 : string2[startpos - size1 - 1])
4436 == '\n'))
4437 goto advance;
4438 }
4439
4440 /* If a fastmap is supplied, skip quickly over characters that
4441 cannot be the start of a match. If the pattern can match the
4442 null string, however, we don't need to skip characters; we want
4443 the first null string. */
4444 if (fastmap && startpos < total_size && !bufp->can_be_null)
4445 {
4446 register re_char *d;
4447 register re_wchar_t buf_ch;
4448
4449 d = POS_ADDR_VSTRING (startpos);
4450
4451 if (range > 0) /* Searching forwards. */
4452 {
4453 register int lim = 0;
4454 int irange = range;
4455
4456 if (startpos < size1 && startpos + range >= size1)
4457 lim = range - (size1 - startpos);
4458
4459 /* Written out as an if-else to avoid testing `translate'
4460 inside the loop. */
4461 if (RE_TRANSLATE_P (translate))
4462 {
4463 if (multibyte)
4464 while (range > lim)
4465 {
4466 int buf_charlen;
4467
4468 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4469 buf_ch = RE_TRANSLATE (translate, buf_ch);
4470 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4471 break;
4472
4473 range -= buf_charlen;
4474 d += buf_charlen;
4475 }
4476 else
4477 while (range > lim)
4478 {
4479 register re_wchar_t ch, translated;
4480
4481 buf_ch = *d;
4482 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4483 translated = RE_TRANSLATE (translate, ch);
4484 if (translated != ch
4485 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4486 buf_ch = ch;
4487 if (fastmap[buf_ch])
4488 break;
4489 d++;
4490 range--;
4491 }
4492 }
4493 else
4494 {
4495 if (multibyte)
4496 while (range > lim)
4497 {
4498 int buf_charlen;
4499
4500 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
4501 if (fastmap[CHAR_LEADING_CODE (buf_ch)])
4502 break;
4503 range -= buf_charlen;
4504 d += buf_charlen;
4505 }
4506 else
4507 while (range > lim && !fastmap[*d])
4508 {
4509 d++;
4510 range--;
4511 }
4512 }
4513 startpos += irange - range;
4514 }
4515 else /* Searching backwards. */
4516 {
4517 if (multibyte)
4518 {
4519 buf_ch = STRING_CHAR (d);
4520 buf_ch = TRANSLATE (buf_ch);
4521 if (! fastmap[CHAR_LEADING_CODE (buf_ch)])
4522 goto advance;
4523 }
4524 else
4525 {
4526 register re_wchar_t ch, translated;
4527
4528 buf_ch = *d;
4529 ch = RE_CHAR_TO_MULTIBYTE (buf_ch);
4530 translated = TRANSLATE (ch);
4531 if (translated != ch
4532 && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0)
4533 buf_ch = ch;
4534 if (! fastmap[TRANSLATE (buf_ch)])
4535 goto advance;
4536 }
4537 }
4538 }
4539
4540 /* If can't match the null string, and that's all we have left, fail. */
4541 if (range >= 0 && startpos == total_size && fastmap
4542 && !bufp->can_be_null)
4543 return -1;
4544
4545 val = re_match_2_internal (bufp, string1, size1, string2, size2,
4546 startpos, regs, stop);
4547
4548 if (val >= 0)
4549 return startpos;
4550
4551 if (val == -2)
4552 return -2;
4553
4554 advance:
4555 if (!range)
4556 break;
4557 else if (range > 0)
4558 {
4559 /* Update STARTPOS to the next character boundary. */
4560 if (multibyte)
4561 {
4562 re_char *p = POS_ADDR_VSTRING (startpos);
4563 re_char *pend = STOP_ADDR_VSTRING (startpos);
4564 int len = BYTES_BY_CHAR_HEAD (*p);
4565
4566 range -= len;
4567 if (range < 0)
4568 break;
4569 startpos += len;
4570 }
4571 else
4572 {
4573 range--;
4574 startpos++;
4575 }
4576 }
4577 else
4578 {
4579 range++;
4580 startpos--;
4581
4582 /* Update STARTPOS to the previous character boundary. */
4583 if (multibyte)
4584 {
4585 re_char *p = POS_ADDR_VSTRING (startpos) + 1;
4586 re_char *p0 = p;
4587 re_char *phead = HEAD_ADDR_VSTRING (startpos);
4588
4589 /* Find the head of multibyte form. */
4590 PREV_CHAR_BOUNDARY (p, phead);
4591 range += p0 - 1 - p;
4592 if (range > 0)
4593 break;
4594
4595 startpos -= p0 - 1 - p;
4596 }
4597 }
4598 }
4599 return -1;
4600 } /* re_search_2 */
4601 WEAK_ALIAS (__re_search_2, re_search_2)
4602 \f
4603 /* Declarations and macros for re_match_2. */
4604
4605 static int bcmp_translate _RE_ARGS((re_char *s1, re_char *s2,
4606 register int len,
4607 RE_TRANSLATE_TYPE translate,
4608 const int multibyte));
4609
4610 /* This converts PTR, a pointer into one of the search strings `string1'
4611 and `string2' into an offset from the beginning of that string. */
4612 #define POINTER_TO_OFFSET(ptr) \
4613 (FIRST_STRING_P (ptr) \
4614 ? ((regoff_t) ((ptr) - string1)) \
4615 : ((regoff_t) ((ptr) - string2 + size1)))
4616
4617 /* Call before fetching a character with *d. This switches over to
4618 string2 if necessary.
4619 Check re_match_2_internal for a discussion of why end_match_2 might
4620 not be within string2 (but be equal to end_match_1 instead). */
4621 #define PREFETCH() \
4622 while (d == dend) \
4623 { \
4624 /* End of string2 => fail. */ \
4625 if (dend == end_match_2) \
4626 goto fail; \
4627 /* End of string1 => advance to string2. */ \
4628 d = string2; \
4629 dend = end_match_2; \
4630 }
4631
4632 /* Call before fetching a char with *d if you already checked other limits.
4633 This is meant for use in lookahead operations like wordend, etc..
4634 where we might need to look at parts of the string that might be
4635 outside of the LIMITs (i.e past `stop'). */
4636 #define PREFETCH_NOLIMIT() \
4637 if (d == end1) \
4638 { \
4639 d = string2; \
4640 dend = end_match_2; \
4641 } \
4642
4643 /* Test if at very beginning or at very end of the virtual concatenation
4644 of `string1' and `string2'. If only one string, it's `string2'. */
4645 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4646 #define AT_STRINGS_END(d) ((d) == end2)
4647
4648
4649 /* Test if D points to a character which is word-constituent. We have
4650 two special cases to check for: if past the end of string1, look at
4651 the first character in string2; and if before the beginning of
4652 string2, look at the last character in string1. */
4653 #define WORDCHAR_P(d) \
4654 (SYNTAX ((d) == end1 ? *string2 \
4655 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
4656 == Sword)
4657
4658 /* Disabled due to a compiler bug -- see comment at case wordbound */
4659
4660 /* The comment at case wordbound is following one, but we don't use
4661 AT_WORD_BOUNDARY anymore to support multibyte form.
4662
4663 The DEC Alpha C compiler 3.x generates incorrect code for the
4664 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of
4665 AT_WORD_BOUNDARY, so this code is disabled. Expanding the
4666 macro and introducing temporary variables works around the bug. */
4667
4668 #if 0
4669 /* Test if the character before D and the one at D differ with respect
4670 to being word-constituent. */
4671 #define AT_WORD_BOUNDARY(d) \
4672 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
4673 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
4674 #endif
4675
4676 /* Free everything we malloc. */
4677 #ifdef MATCH_MAY_ALLOCATE
4678 # define FREE_VAR(var) if (var) { REGEX_FREE (var); var = NULL; } else
4679 # define FREE_VARIABLES() \
4680 do { \
4681 REGEX_FREE_STACK (fail_stack.stack); \
4682 FREE_VAR (regstart); \
4683 FREE_VAR (regend); \
4684 FREE_VAR (best_regstart); \
4685 FREE_VAR (best_regend); \
4686 } while (0)
4687 #else
4688 # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4689 #endif /* not MATCH_MAY_ALLOCATE */
4690
4691 \f
4692 /* Optimization routines. */
4693
4694 /* If the operation is a match against one or more chars,
4695 return a pointer to the next operation, else return NULL. */
4696 static re_char *
4697 skip_one_char (const re_char *p)
4698 {
4699 switch (SWITCH_ENUM_CAST (*p++))
4700 {
4701 case anychar:
4702 break;
4703
4704 case exactn:
4705 p += *p + 1;
4706 break;
4707
4708 case charset_not:
4709 case charset:
4710 if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
4711 {
4712 int mcnt;
4713 p = CHARSET_RANGE_TABLE (p - 1);
4714 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4715 p = CHARSET_RANGE_TABLE_END (p, mcnt);
4716 }
4717 else
4718 p += 1 + CHARSET_BITMAP_SIZE (p - 1);
4719 break;
4720
4721 case syntaxspec:
4722 case notsyntaxspec:
4723 #ifdef emacs
4724 case categoryspec:
4725 case notcategoryspec:
4726 #endif /* emacs */
4727 p++;
4728 break;
4729
4730 default:
4731 p = NULL;
4732 }
4733 return p;
4734 }
4735
4736
4737 /* Jump over non-matching operations. */
4738 static re_char *
4739 skip_noops (const re_char *p, const re_char *pend)
4740 {
4741 int mcnt;
4742 while (p < pend)
4743 {
4744 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p))
4745 {
4746 case start_memory:
4747 case stop_memory:
4748 p += 2; break;
4749 case no_op:
4750 p += 1; break;
4751 case jump:
4752 p += 1;
4753 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4754 p += mcnt;
4755 break;
4756 default:
4757 return p;
4758 }
4759 }
4760 assert (p == pend);
4761 return p;
4762 }
4763
4764 /* Non-zero if "p1 matches something" implies "p2 fails". */
4765 static int
4766 mutually_exclusive_p (struct re_pattern_buffer *bufp, const re_char *p1, const re_char *p2)
4767 {
4768 re_opcode_t op2;
4769 const boolean multibyte = RE_MULTIBYTE_P (bufp);
4770 unsigned char *pend = bufp->buffer + bufp->used;
4771
4772 assert (p1 >= bufp->buffer && p1 < pend
4773 && p2 >= bufp->buffer && p2 <= pend);
4774
4775 /* Skip over open/close-group commands.
4776 If what follows this loop is a ...+ construct,
4777 look at what begins its body, since we will have to
4778 match at least one of that. */
4779 p2 = skip_noops (p2, pend);
4780 /* The same skip can be done for p1, except that this function
4781 is only used in the case where p1 is a simple match operator. */
4782 /* p1 = skip_noops (p1, pend); */
4783
4784 assert (p1 >= bufp->buffer && p1 < pend
4785 && p2 >= bufp->buffer && p2 <= pend);
4786
4787 op2 = p2 == pend ? succeed : *p2;
4788
4789 switch (SWITCH_ENUM_CAST (op2))
4790 {
4791 case succeed:
4792 case endbuf:
4793 /* If we're at the end of the pattern, we can change. */
4794 if (skip_one_char (p1))
4795 {
4796 DEBUG_PRINT1 (" End of pattern: fast loop.\n");
4797 return 1;
4798 }
4799 break;
4800
4801 case endline:
4802 case exactn:
4803 {
4804 register re_wchar_t c
4805 = (re_opcode_t) *p2 == endline ? '\n'
4806 : RE_STRING_CHAR (p2 + 2, multibyte);
4807
4808 if ((re_opcode_t) *p1 == exactn)
4809 {
4810 if (c != RE_STRING_CHAR (p1 + 2, multibyte))
4811 {
4812 DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]);
4813 return 1;
4814 }
4815 }
4816
4817 else if ((re_opcode_t) *p1 == charset
4818 || (re_opcode_t) *p1 == charset_not)
4819 {
4820 int not = (re_opcode_t) *p1 == charset_not;
4821
4822 /* Test if C is listed in charset (or charset_not)
4823 at `p1'. */
4824 if (! multibyte || IS_REAL_ASCII (c))
4825 {
4826 if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH
4827 && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4828 not = !not;
4829 }
4830 else if (CHARSET_RANGE_TABLE_EXISTS_P (p1))
4831 CHARSET_LOOKUP_RANGE_TABLE (not, c, p1);
4832
4833 /* `not' is equal to 1 if c would match, which means
4834 that we can't change to pop_failure_jump. */
4835 if (!not)
4836 {
4837 DEBUG_PRINT1 (" No match => fast loop.\n");
4838 return 1;
4839 }
4840 }
4841 else if ((re_opcode_t) *p1 == anychar
4842 && c == '\n')
4843 {
4844 DEBUG_PRINT1 (" . != \\n => fast loop.\n");
4845 return 1;
4846 }
4847 }
4848 break;
4849
4850 case charset:
4851 {
4852 if ((re_opcode_t) *p1 == exactn)
4853 /* Reuse the code above. */
4854 return mutually_exclusive_p (bufp, p2, p1);
4855
4856 /* It is hard to list up all the character in charset
4857 P2 if it includes multibyte character. Give up in
4858 such case. */
4859 else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
4860 {
4861 /* Now, we are sure that P2 has no range table.
4862 So, for the size of bitmap in P2, `p2[1]' is
4863 enough. But P1 may have range table, so the
4864 size of bitmap table of P1 is extracted by
4865 using macro `CHARSET_BITMAP_SIZE'.
4866
4867 In a multibyte case, we know that all the character
4868 listed in P2 is ASCII. In a unibyte case, P1 has only a
4869 bitmap table. So, in both cases, it is enough to test
4870 only the bitmap table of P1. */
4871
4872 if ((re_opcode_t) *p1 == charset)
4873 {
4874 int idx;
4875 /* We win if the charset inside the loop
4876 has no overlap with the one after the loop. */
4877 for (idx = 0;
4878 (idx < (int) p2[1]
4879 && idx < CHARSET_BITMAP_SIZE (p1));
4880 idx++)
4881 if ((p2[2 + idx] & p1[2 + idx]) != 0)
4882 break;
4883
4884 if (idx == p2[1]
4885 || idx == CHARSET_BITMAP_SIZE (p1))
4886 {
4887 DEBUG_PRINT1 (" No match => fast loop.\n");
4888 return 1;
4889 }
4890 }
4891 else if ((re_opcode_t) *p1 == charset_not)
4892 {
4893 int idx;
4894 /* We win if the charset_not inside the loop lists
4895 every character listed in the charset after. */
4896 for (idx = 0; idx < (int) p2[1]; idx++)
4897 if (! (p2[2 + idx] == 0
4898 || (idx < CHARSET_BITMAP_SIZE (p1)
4899 && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
4900 break;
4901
4902 if (idx == p2[1])
4903 {
4904 DEBUG_PRINT1 (" No match => fast loop.\n");
4905 return 1;
4906 }
4907 }
4908 }
4909 }
4910 break;
4911
4912 case charset_not:
4913 switch (SWITCH_ENUM_CAST (*p1))
4914 {
4915 case exactn:
4916 case charset:
4917 /* Reuse the code above. */
4918 return mutually_exclusive_p (bufp, p2, p1);
4919 case charset_not:
4920 /* When we have two charset_not, it's very unlikely that
4921 they don't overlap. The union of the two sets of excluded
4922 chars should cover all possible chars, which, as a matter of
4923 fact, is virtually impossible in multibyte buffers. */
4924 break;
4925 }
4926 break;
4927
4928 case wordend:
4929 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == Sword);
4930 case symend:
4931 return ((re_opcode_t) *p1 == syntaxspec
4932 && (p1[1] == Ssymbol || p1[1] == Sword));
4933 case notsyntaxspec:
4934 return ((re_opcode_t) *p1 == syntaxspec && p1[1] == p2[1]);
4935
4936 case wordbeg:
4937 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == Sword);
4938 case symbeg:
4939 return ((re_opcode_t) *p1 == notsyntaxspec
4940 && (p1[1] == Ssymbol || p1[1] == Sword));
4941 case syntaxspec:
4942 return ((re_opcode_t) *p1 == notsyntaxspec && p1[1] == p2[1]);
4943
4944 case wordbound:
4945 return (((re_opcode_t) *p1 == notsyntaxspec
4946 || (re_opcode_t) *p1 == syntaxspec)
4947 && p1[1] == Sword);
4948
4949 #ifdef emacs
4950 case categoryspec:
4951 return ((re_opcode_t) *p1 == notcategoryspec && p1[1] == p2[1]);
4952 case notcategoryspec:
4953 return ((re_opcode_t) *p1 == categoryspec && p1[1] == p2[1]);
4954 #endif /* emacs */
4955
4956 default:
4957 ;
4958 }
4959
4960 /* Safe default. */
4961 return 0;
4962 }
4963
4964 \f
4965 /* Matching routines. */
4966
4967 #ifndef emacs /* Emacs never uses this. */
4968 /* re_match is like re_match_2 except it takes only a single string. */
4969
4970 int
4971 re_match (struct re_pattern_buffer *bufp, const char *string,
4972 int size, int pos, struct re_registers *regs)
4973 {
4974 int result = re_match_2_internal (bufp, NULL, 0, (re_char*) string, size,
4975 pos, regs, size);
4976 return result;
4977 }
4978 WEAK_ALIAS (__re_match, re_match)
4979 #endif /* not emacs */
4980
4981 #ifdef emacs
4982 /* In Emacs, this is the string or buffer in which we
4983 are matching. It is used for looking up syntax properties. */
4984 Lisp_Object re_match_object;
4985 #endif
4986
4987 /* re_match_2 matches the compiled pattern in BUFP against the
4988 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
4989 and SIZE2, respectively). We start matching at POS, and stop
4990 matching at STOP.
4991
4992 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4993 store offsets for the substring each group matched in REGS. See the
4994 documentation for exactly how many groups we fill.
4995
4996 We return -1 if no match, -2 if an internal error (such as the
4997 failure stack overflowing). Otherwise, we return the length of the
4998 matched substring. */
4999
5000 int
5001 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int size1, const char *string2, int size2, int pos, struct re_registers *regs, int stop)
5002 {
5003 int result;
5004
5005 #ifdef emacs
5006 int charpos;
5007 gl_state.object = re_match_object; /* Used by SYNTAX_TABLE_BYTE_TO_CHAR. */
5008 charpos = SYNTAX_TABLE_BYTE_TO_CHAR (POS_AS_IN_BUFFER (pos));
5009 SETUP_SYNTAX_TABLE_FOR_OBJECT (re_match_object, charpos, 1);
5010 #endif
5011
5012 result = re_match_2_internal (bufp, (re_char*) string1, size1,
5013 (re_char*) string2, size2,
5014 pos, regs, stop);
5015 return result;
5016 }
5017 WEAK_ALIAS (__re_match_2, re_match_2)
5018
5019
5020 /* This is a separate function so that we can force an alloca cleanup
5021 afterwards. */
5022 static int
5023 re_match_2_internal (struct re_pattern_buffer *bufp, const re_char *string1, int size1, const re_char *string2, int size2, int pos, struct re_registers *regs, int stop)
5024 {
5025 /* General temporaries. */
5026 int mcnt;
5027 size_t reg;
5028 boolean not;
5029
5030 /* Just past the end of the corresponding string. */
5031 re_char *end1, *end2;
5032
5033 /* Pointers into string1 and string2, just past the last characters in
5034 each to consider matching. */
5035 re_char *end_match_1, *end_match_2;
5036
5037 /* Where we are in the data, and the end of the current string. */
5038 re_char *d, *dend;
5039
5040 /* Used sometimes to remember where we were before starting matching
5041 an operator so that we can go back in case of failure. This "atomic"
5042 behavior of matching opcodes is indispensable to the correctness
5043 of the on_failure_keep_string_jump optimization. */
5044 re_char *dfail;
5045
5046 /* Where we are in the pattern, and the end of the pattern. */
5047 re_char *p = bufp->buffer;
5048 re_char *pend = p + bufp->used;
5049
5050 /* We use this to map every character in the string. */
5051 RE_TRANSLATE_TYPE translate = bufp->translate;
5052
5053 /* Nonzero if BUFP is setup from a multibyte regex. */
5054 const boolean multibyte = RE_MULTIBYTE_P (bufp);
5055
5056 /* Nonzero if STRING1/STRING2 are multibyte. */
5057 const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp);
5058
5059 /* Failure point stack. Each place that can handle a failure further
5060 down the line pushes a failure point on this stack. It consists of
5061 regstart, and regend for all registers corresponding to
5062 the subexpressions we're currently inside, plus the number of such
5063 registers, and, finally, two char *'s. The first char * is where
5064 to resume scanning the pattern; the second one is where to resume
5065 scanning the strings. */
5066 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
5067 fail_stack_type fail_stack;
5068 #endif
5069 #ifdef DEBUG
5070 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
5071 #endif
5072
5073 #if defined REL_ALLOC && defined REGEX_MALLOC
5074 /* This holds the pointer to the failure stack, when
5075 it is allocated relocatably. */
5076 fail_stack_elt_t *failure_stack_ptr;
5077 #endif
5078
5079 /* We fill all the registers internally, independent of what we
5080 return, for use in backreferences. The number here includes
5081 an element for register zero. */
5082 size_t num_regs = bufp->re_nsub + 1;
5083
5084 /* Information on the contents of registers. These are pointers into
5085 the input strings; they record just what was matched (on this
5086 attempt) by a subexpression part of the pattern, that is, the
5087 regnum-th regstart pointer points to where in the pattern we began
5088 matching and the regnum-th regend points to right after where we
5089 stopped matching the regnum-th subexpression. (The zeroth register
5090 keeps track of what the whole pattern matches.) */
5091 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5092 re_char **regstart, **regend;
5093 #endif
5094
5095 /* The following record the register info as found in the above
5096 variables when we find a match better than any we've seen before.
5097 This happens as we backtrack through the failure points, which in
5098 turn happens only if we have not yet matched the entire string. */
5099 unsigned best_regs_set = false;
5100 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
5101 re_char **best_regstart, **best_regend;
5102 #endif
5103
5104 /* Logically, this is `best_regend[0]'. But we don't want to have to
5105 allocate space for that if we're not allocating space for anything
5106 else (see below). Also, we never need info about register 0 for
5107 any of the other register vectors, and it seems rather a kludge to
5108 treat `best_regend' differently than the rest. So we keep track of
5109 the end of the best match so far in a separate variable. We
5110 initialize this to NULL so that when we backtrack the first time
5111 and need to test it, it's not garbage. */
5112 re_char *match_end = NULL;
5113
5114 #ifdef DEBUG
5115 /* Counts the total number of registers pushed. */
5116 unsigned num_regs_pushed = 0;
5117 #endif
5118
5119 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
5120
5121 INIT_FAIL_STACK ();
5122
5123 #ifdef MATCH_MAY_ALLOCATE
5124 /* Do not bother to initialize all the register variables if there are
5125 no groups in the pattern, as it takes a fair amount of time. If
5126 there are groups, we include space for register 0 (the whole
5127 pattern), even though we never use it, since it simplifies the
5128 array indexing. We should fix this. */
5129 if (bufp->re_nsub)
5130 {
5131 regstart = REGEX_TALLOC (num_regs, re_char *);
5132 regend = REGEX_TALLOC (num_regs, re_char *);
5133 best_regstart = REGEX_TALLOC (num_regs, re_char *);
5134 best_regend = REGEX_TALLOC (num_regs, re_char *);
5135
5136 if (!(regstart && regend && best_regstart && best_regend))
5137 {
5138 FREE_VARIABLES ();
5139 return -2;
5140 }
5141 }
5142 else
5143 {
5144 /* We must initialize all our variables to NULL, so that
5145 `FREE_VARIABLES' doesn't try to free them. */
5146 regstart = regend = best_regstart = best_regend = NULL;
5147 }
5148 #endif /* MATCH_MAY_ALLOCATE */
5149
5150 /* The starting position is bogus. */
5151 if (pos < 0 || pos > size1 + size2)
5152 {
5153 FREE_VARIABLES ();
5154 return -1;
5155 }
5156
5157 /* Initialize subexpression text positions to -1 to mark ones that no
5158 start_memory/stop_memory has been seen for. Also initialize the
5159 register information struct. */
5160 for (reg = 1; reg < num_regs; reg++)
5161 regstart[reg] = regend[reg] = NULL;
5162
5163 /* We move `string1' into `string2' if the latter's empty -- but not if
5164 `string1' is null. */
5165 if (size2 == 0 && string1 != NULL)
5166 {
5167 string2 = string1;
5168 size2 = size1;
5169 string1 = 0;
5170 size1 = 0;
5171 }
5172 end1 = string1 + size1;
5173 end2 = string2 + size2;
5174
5175 /* `p' scans through the pattern as `d' scans through the data.
5176 `dend' is the end of the input string that `d' points within. `d'
5177 is advanced into the following input string whenever necessary, but
5178 this happens before fetching; therefore, at the beginning of the
5179 loop, `d' can be pointing at the end of a string, but it cannot
5180 equal `string2'. */
5181 if (pos >= size1)
5182 {
5183 /* Only match within string2. */
5184 d = string2 + pos - size1;
5185 dend = end_match_2 = string2 + stop - size1;
5186 end_match_1 = end1; /* Just to give it a value. */
5187 }
5188 else
5189 {
5190 if (stop < size1)
5191 {
5192 /* Only match within string1. */
5193 end_match_1 = string1 + stop;
5194 /* BEWARE!
5195 When we reach end_match_1, PREFETCH normally switches to string2.
5196 But in the present case, this means that just doing a PREFETCH
5197 makes us jump from `stop' to `gap' within the string.
5198 What we really want here is for the search to stop as
5199 soon as we hit end_match_1. That's why we set end_match_2
5200 to end_match_1 (since PREFETCH fails as soon as we hit
5201 end_match_2). */
5202 end_match_2 = end_match_1;
5203 }
5204 else
5205 { /* It's important to use this code when stop == size so that
5206 moving `d' from end1 to string2 will not prevent the d == dend
5207 check from catching the end of string. */
5208 end_match_1 = end1;
5209 end_match_2 = string2 + stop - size1;
5210 }
5211 d = string1 + pos;
5212 dend = end_match_1;
5213 }
5214
5215 DEBUG_PRINT1 ("The compiled pattern is: ");
5216 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
5217 DEBUG_PRINT1 ("The string to match is: `");
5218 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
5219 DEBUG_PRINT1 ("'\n");
5220
5221 /* This loops over pattern commands. It exits by returning from the
5222 function if the match is complete, or it drops through if the match
5223 fails at this starting point in the input data. */
5224 for (;;)
5225 {
5226 DEBUG_PRINT2 ("\n%p: ", p);
5227
5228 if (p == pend)
5229 { /* End of pattern means we might have succeeded. */
5230 DEBUG_PRINT1 ("end of pattern ... ");
5231
5232 /* If we haven't matched the entire string, and we want the
5233 longest match, try backtracking. */
5234 if (d != end_match_2)
5235 {
5236 /* 1 if this match ends in the same string (string1 or string2)
5237 as the best previous match. */
5238 boolean same_str_p = (FIRST_STRING_P (match_end)
5239 == FIRST_STRING_P (d));
5240 /* 1 if this match is the best seen so far. */
5241 boolean best_match_p;
5242
5243 /* AIX compiler got confused when this was combined
5244 with the previous declaration. */
5245 if (same_str_p)
5246 best_match_p = d > match_end;
5247 else
5248 best_match_p = !FIRST_STRING_P (d);
5249
5250 DEBUG_PRINT1 ("backtracking.\n");
5251
5252 if (!FAIL_STACK_EMPTY ())
5253 { /* More failure points to try. */
5254
5255 /* If exceeds best match so far, save it. */
5256 if (!best_regs_set || best_match_p)
5257 {
5258 best_regs_set = true;
5259 match_end = d;
5260
5261 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
5262
5263 for (reg = 1; reg < num_regs; reg++)
5264 {
5265 best_regstart[reg] = regstart[reg];
5266 best_regend[reg] = regend[reg];
5267 }
5268 }
5269 goto fail;
5270 }
5271
5272 /* If no failure points, don't restore garbage. And if
5273 last match is real best match, don't restore second
5274 best one. */
5275 else if (best_regs_set && !best_match_p)
5276 {
5277 restore_best_regs:
5278 /* Restore best match. It may happen that `dend ==
5279 end_match_1' while the restored d is in string2.
5280 For example, the pattern `x.*y.*z' against the
5281 strings `x-' and `y-z-', if the two strings are
5282 not consecutive in memory. */
5283 DEBUG_PRINT1 ("Restoring best registers.\n");
5284
5285 d = match_end;
5286 dend = ((d >= string1 && d <= end1)
5287 ? end_match_1 : end_match_2);
5288
5289 for (reg = 1; reg < num_regs; reg++)
5290 {
5291 regstart[reg] = best_regstart[reg];
5292 regend[reg] = best_regend[reg];
5293 }
5294 }
5295 } /* d != end_match_2 */
5296
5297 succeed_label:
5298 DEBUG_PRINT1 ("Accepting match.\n");
5299
5300 /* If caller wants register contents data back, do it. */
5301 if (regs && !bufp->no_sub)
5302 {
5303 /* Have the register data arrays been allocated? */
5304 if (bufp->regs_allocated == REGS_UNALLOCATED)
5305 { /* No. So allocate them with malloc. We need one
5306 extra element beyond `num_regs' for the `-1' marker
5307 GNU code uses. */
5308 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
5309 regs->start = TALLOC (regs->num_regs, regoff_t);
5310 regs->end = TALLOC (regs->num_regs, regoff_t);
5311 if (regs->start == NULL || regs->end == NULL)
5312 {
5313 FREE_VARIABLES ();
5314 return -2;
5315 }
5316 bufp->regs_allocated = REGS_REALLOCATE;
5317 }
5318 else if (bufp->regs_allocated == REGS_REALLOCATE)
5319 { /* Yes. If we need more elements than were already
5320 allocated, reallocate them. If we need fewer, just
5321 leave it alone. */
5322 if (regs->num_regs < num_regs + 1)
5323 {
5324 regs->num_regs = num_regs + 1;
5325 RETALLOC (regs->start, regs->num_regs, regoff_t);
5326 RETALLOC (regs->end, regs->num_regs, regoff_t);
5327 if (regs->start == NULL || regs->end == NULL)
5328 {
5329 FREE_VARIABLES ();
5330 return -2;
5331 }
5332 }
5333 }
5334 else
5335 {
5336 /* These braces fend off a "empty body in an else-statement"
5337 warning under GCC when assert expands to nothing. */
5338 assert (bufp->regs_allocated == REGS_FIXED);
5339 }
5340
5341 /* Convert the pointer data in `regstart' and `regend' to
5342 indices. Register zero has to be set differently,
5343 since we haven't kept track of any info for it. */
5344 if (regs->num_regs > 0)
5345 {
5346 regs->start[0] = pos;
5347 regs->end[0] = POINTER_TO_OFFSET (d);
5348 }
5349
5350 /* Go through the first `min (num_regs, regs->num_regs)'
5351 registers, since that is all we initialized. */
5352 for (reg = 1; reg < MIN (num_regs, regs->num_regs); reg++)
5353 {
5354 if (REG_UNSET (regstart[reg]) || REG_UNSET (regend[reg]))
5355 regs->start[reg] = regs->end[reg] = -1;
5356 else
5357 {
5358 regs->start[reg]
5359 = (regoff_t) POINTER_TO_OFFSET (regstart[reg]);
5360 regs->end[reg]
5361 = (regoff_t) POINTER_TO_OFFSET (regend[reg]);
5362 }
5363 }
5364
5365 /* If the regs structure we return has more elements than
5366 were in the pattern, set the extra elements to -1. If
5367 we (re)allocated the registers, this is the case,
5368 because we always allocate enough to have at least one
5369 -1 at the end. */
5370 for (reg = num_regs; reg < regs->num_regs; reg++)
5371 regs->start[reg] = regs->end[reg] = -1;
5372 } /* regs && !bufp->no_sub */
5373
5374 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
5375 nfailure_points_pushed, nfailure_points_popped,
5376 nfailure_points_pushed - nfailure_points_popped);
5377 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
5378
5379 mcnt = POINTER_TO_OFFSET (d) - pos;
5380
5381 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
5382
5383 FREE_VARIABLES ();
5384 return mcnt;
5385 }
5386
5387 /* Otherwise match next pattern command. */
5388 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
5389 {
5390 /* Ignore these. Used to ignore the n of succeed_n's which
5391 currently have n == 0. */
5392 case no_op:
5393 DEBUG_PRINT1 ("EXECUTING no_op.\n");
5394 break;
5395
5396 case succeed:
5397 DEBUG_PRINT1 ("EXECUTING succeed.\n");
5398 goto succeed_label;
5399
5400 /* Match the next n pattern characters exactly. The following
5401 byte in the pattern defines n, and the n bytes after that
5402 are the characters to match. */
5403 case exactn:
5404 mcnt = *p++;
5405 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
5406
5407 /* Remember the start point to rollback upon failure. */
5408 dfail = d;
5409
5410 #ifndef emacs
5411 /* This is written out as an if-else so we don't waste time
5412 testing `translate' inside the loop. */
5413 if (RE_TRANSLATE_P (translate))
5414 do
5415 {
5416 PREFETCH ();
5417 if (RE_TRANSLATE (translate, *d) != *p++)
5418 {
5419 d = dfail;
5420 goto fail;
5421 }
5422 d++;
5423 }
5424 while (--mcnt);
5425 else
5426 do
5427 {
5428 PREFETCH ();
5429 if (*d++ != *p++)
5430 {
5431 d = dfail;
5432 goto fail;
5433 }
5434 }
5435 while (--mcnt);
5436 #else /* emacs */
5437 /* The cost of testing `translate' is comparatively small. */
5438 if (target_multibyte)
5439 do
5440 {
5441 int pat_charlen, buf_charlen;
5442 int pat_ch, buf_ch;
5443
5444 PREFETCH ();
5445 if (multibyte)
5446 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5447 else
5448 {
5449 pat_ch = RE_CHAR_TO_MULTIBYTE (*p);
5450 pat_charlen = 1;
5451 }
5452 buf_ch = STRING_CHAR_AND_LENGTH (d, buf_charlen);
5453
5454 if (TRANSLATE (buf_ch) != pat_ch)
5455 {
5456 d = dfail;
5457 goto fail;
5458 }
5459
5460 p += pat_charlen;
5461 d += buf_charlen;
5462 mcnt -= pat_charlen;
5463 }
5464 while (mcnt > 0);
5465 else
5466 do
5467 {
5468 int pat_charlen, buf_charlen;
5469 int pat_ch, buf_ch;
5470
5471 PREFETCH ();
5472 if (multibyte)
5473 {
5474 pat_ch = STRING_CHAR_AND_LENGTH (p, pat_charlen);
5475 pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch);
5476 }
5477 else
5478 {
5479 pat_ch = *p;
5480 pat_charlen = 1;
5481 }
5482 buf_ch = RE_CHAR_TO_MULTIBYTE (*d);
5483 if (! CHAR_BYTE8_P (buf_ch))
5484 {
5485 buf_ch = TRANSLATE (buf_ch);
5486 buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch);
5487 if (buf_ch < 0)
5488 buf_ch = *d;
5489 }
5490 else
5491 buf_ch = *d;
5492 if (buf_ch != pat_ch)
5493 {
5494 d = dfail;
5495 goto fail;
5496 }
5497 p += pat_charlen;
5498 d++;
5499 }
5500 while (--mcnt);
5501 #endif
5502 break;
5503
5504
5505 /* Match any character except possibly a newline or a null. */
5506 case anychar:
5507 {
5508 int buf_charlen;
5509 re_wchar_t buf_ch;
5510
5511 DEBUG_PRINT1 ("EXECUTING anychar.\n");
5512
5513 PREFETCH ();
5514 buf_ch = RE_STRING_CHAR_AND_LENGTH (d, buf_charlen,
5515 target_multibyte);
5516 buf_ch = TRANSLATE (buf_ch);
5517
5518 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5519 && buf_ch == '\n')
5520 || ((bufp->syntax & RE_DOT_NOT_NULL)
5521 && buf_ch == '\000'))
5522 goto fail;
5523
5524 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
5525 d += buf_charlen;
5526 }
5527 break;
5528
5529
5530 case charset:
5531 case charset_not:
5532 {
5533 register unsigned int c;
5534 boolean not = (re_opcode_t) *(p - 1) == charset_not;
5535 int len;
5536
5537 /* Start of actual range_table, or end of bitmap if there is no
5538 range table. */
5539 re_char *range_table;
5540
5541 /* Nonzero if there is a range table. */
5542 int range_table_exists;
5543
5544 /* Number of ranges of range table. This is not included
5545 in the initial byte-length of the command. */
5546 int count = 0;
5547
5548 /* Whether matching against a unibyte character. */
5549 boolean unibyte_char = false;
5550
5551 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
5552
5553 range_table_exists = CHARSET_RANGE_TABLE_EXISTS_P (&p[-1]);
5554
5555 if (range_table_exists)
5556 {
5557 range_table = CHARSET_RANGE_TABLE (&p[-1]); /* Past the bitmap. */
5558 EXTRACT_NUMBER_AND_INCR (count, range_table);
5559 }
5560
5561 PREFETCH ();
5562 c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
5563 if (target_multibyte)
5564 {
5565 int c1;
5566
5567 c = TRANSLATE (c);
5568 c1 = RE_CHAR_TO_UNIBYTE (c);
5569 if (c1 >= 0)
5570 {
5571 unibyte_char = true;
5572 c = c1;
5573 }
5574 }
5575 else
5576 {
5577 int c1 = RE_CHAR_TO_MULTIBYTE (c);
5578
5579 if (! CHAR_BYTE8_P (c1))
5580 {
5581 c1 = TRANSLATE (c1);
5582 c1 = RE_CHAR_TO_UNIBYTE (c1);
5583 if (c1 >= 0)
5584 {
5585 unibyte_char = true;
5586 c = c1;
5587 }
5588 }
5589 else
5590 unibyte_char = true;
5591 }
5592
5593 if (unibyte_char && c < (1 << BYTEWIDTH))
5594 { /* Lookup bitmap. */
5595 /* Cast to `unsigned' instead of `unsigned char' in
5596 case the bit list is a full 32 bytes long. */
5597 if (c < (unsigned) (CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH)
5598 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
5599 not = !not;
5600 }
5601 #ifdef emacs
5602 else if (range_table_exists)
5603 {
5604 int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]);
5605
5606 if ( (class_bits & BIT_LOWER && ISLOWER (c))
5607 | (class_bits & BIT_MULTIBYTE)
5608 | (class_bits & BIT_PUNCT && ISPUNCT (c))
5609 | (class_bits & BIT_SPACE && ISSPACE (c))
5610 | (class_bits & BIT_UPPER && ISUPPER (c))
5611 | (class_bits & BIT_WORD && ISWORD (c)))
5612 not = !not;
5613 else
5614 CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count);
5615 }
5616 #endif /* emacs */
5617
5618 if (range_table_exists)
5619 p = CHARSET_RANGE_TABLE_END (range_table, count);
5620 else
5621 p += CHARSET_BITMAP_SIZE (&p[-1]) + 1;
5622
5623 if (!not) goto fail;
5624
5625 d += len;
5626 break;
5627 }
5628
5629
5630 /* The beginning of a group is represented by start_memory.
5631 The argument is the register number. The text
5632 matched within the group is recorded (in the internal
5633 registers data structure) under the register number. */
5634 case start_memory:
5635 DEBUG_PRINT2 ("EXECUTING start_memory %d:\n", *p);
5636
5637 /* In case we need to undo this operation (via backtracking). */
5638 PUSH_FAILURE_REG ((unsigned int)*p);
5639
5640 regstart[*p] = d;
5641 regend[*p] = NULL; /* probably unnecessary. -sm */
5642 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
5643
5644 /* Move past the register number and inner group count. */
5645 p += 1;
5646 break;
5647
5648
5649 /* The stop_memory opcode represents the end of a group. Its
5650 argument is the same as start_memory's: the register number. */
5651 case stop_memory:
5652 DEBUG_PRINT2 ("EXECUTING stop_memory %d:\n", *p);
5653
5654 assert (!REG_UNSET (regstart[*p]));
5655 /* Strictly speaking, there should be code such as:
5656
5657 assert (REG_UNSET (regend[*p]));
5658 PUSH_FAILURE_REGSTOP ((unsigned int)*p);
5659
5660 But the only info to be pushed is regend[*p] and it is known to
5661 be UNSET, so there really isn't anything to push.
5662 Not pushing anything, on the other hand deprives us from the
5663 guarantee that regend[*p] is UNSET since undoing this operation
5664 will not reset its value properly. This is not important since
5665 the value will only be read on the next start_memory or at
5666 the very end and both events can only happen if this stop_memory
5667 is *not* undone. */
5668
5669 regend[*p] = d;
5670 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
5671
5672 /* Move past the register number and the inner group count. */
5673 p += 1;
5674 break;
5675
5676
5677 /* \<digit> has been turned into a `duplicate' command which is
5678 followed by the numeric value of <digit> as the register number. */
5679 case duplicate:
5680 {
5681 register re_char *d2, *dend2;
5682 int regno = *p++; /* Get which register to match against. */
5683 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
5684
5685 /* Can't back reference a group which we've never matched. */
5686 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
5687 goto fail;
5688
5689 /* Where in input to try to start matching. */
5690 d2 = regstart[regno];
5691
5692 /* Remember the start point to rollback upon failure. */
5693 dfail = d;
5694
5695 /* Where to stop matching; if both the place to start and
5696 the place to stop matching are in the same string, then
5697 set to the place to stop, otherwise, for now have to use
5698 the end of the first string. */
5699
5700 dend2 = ((FIRST_STRING_P (regstart[regno])
5701 == FIRST_STRING_P (regend[regno]))
5702 ? regend[regno] : end_match_1);
5703 for (;;)
5704 {
5705 /* If necessary, advance to next segment in register
5706 contents. */
5707 while (d2 == dend2)
5708 {
5709 if (dend2 == end_match_2) break;
5710 if (dend2 == regend[regno]) break;
5711
5712 /* End of string1 => advance to string2. */
5713 d2 = string2;
5714 dend2 = regend[regno];
5715 }
5716 /* At end of register contents => success */
5717 if (d2 == dend2) break;
5718
5719 /* If necessary, advance to next segment in data. */
5720 PREFETCH ();
5721
5722 /* How many characters left in this segment to match. */
5723 mcnt = dend - d;
5724
5725 /* Want how many consecutive characters we can match in
5726 one shot, so, if necessary, adjust the count. */
5727 if (mcnt > dend2 - d2)
5728 mcnt = dend2 - d2;
5729
5730 /* Compare that many; failure if mismatch, else move
5731 past them. */
5732 if (RE_TRANSLATE_P (translate)
5733 ? bcmp_translate (d, d2, mcnt, translate, target_multibyte)
5734 : memcmp (d, d2, mcnt))
5735 {
5736 d = dfail;
5737 goto fail;
5738 }
5739 d += mcnt, d2 += mcnt;
5740 }
5741 }
5742 break;
5743
5744
5745 /* begline matches the empty string at the beginning of the string
5746 (unless `not_bol' is set in `bufp'), and after newlines. */
5747 case begline:
5748 DEBUG_PRINT1 ("EXECUTING begline.\n");
5749
5750 if (AT_STRINGS_BEG (d))
5751 {
5752 if (!bufp->not_bol) break;
5753 }
5754 else
5755 {
5756 unsigned c;
5757 GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2);
5758 if (c == '\n')
5759 break;
5760 }
5761 /* In all other cases, we fail. */
5762 goto fail;
5763
5764
5765 /* endline is the dual of begline. */
5766 case endline:
5767 DEBUG_PRINT1 ("EXECUTING endline.\n");
5768
5769 if (AT_STRINGS_END (d))
5770 {
5771 if (!bufp->not_eol) break;
5772 }
5773 else
5774 {
5775 PREFETCH_NOLIMIT ();
5776 if (*d == '\n')
5777 break;
5778 }
5779 goto fail;
5780
5781
5782 /* Match at the very beginning of the data. */
5783 case begbuf:
5784 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
5785 if (AT_STRINGS_BEG (d))
5786 break;
5787 goto fail;
5788
5789
5790 /* Match at the very end of the data. */
5791 case endbuf:
5792 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
5793 if (AT_STRINGS_END (d))
5794 break;
5795 goto fail;
5796
5797
5798 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
5799 pushes NULL as the value for the string on the stack. Then
5800 `POP_FAILURE_POINT' will keep the current value for the
5801 string, instead of restoring it. To see why, consider
5802 matching `foo\nbar' against `.*\n'. The .* matches the foo;
5803 then the . fails against the \n. But the next thing we want
5804 to do is match the \n against the \n; if we restored the
5805 string value, we would be back at the foo.
5806
5807 Because this is used only in specific cases, we don't need to
5808 check all the things that `on_failure_jump' does, to make
5809 sure the right things get saved on the stack. Hence we don't
5810 share its code. The only reason to push anything on the
5811 stack at all is that otherwise we would have to change
5812 `anychar's code to do something besides goto fail in this
5813 case; that seems worse than this. */
5814 case on_failure_keep_string_jump:
5815 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5816 DEBUG_PRINT3 ("EXECUTING on_failure_keep_string_jump %d (to %p):\n",
5817 mcnt, p + mcnt);
5818
5819 PUSH_FAILURE_POINT (p - 3, NULL);
5820 break;
5821
5822 /* A nasty loop is introduced by the non-greedy *? and +?.
5823 With such loops, the stack only ever contains one failure point
5824 at a time, so that a plain on_failure_jump_loop kind of
5825 cycle detection cannot work. Worse yet, such a detection
5826 can not only fail to detect a cycle, but it can also wrongly
5827 detect a cycle (between different instantiations of the same
5828 loop).
5829 So the method used for those nasty loops is a little different:
5830 We use a special cycle-detection-stack-frame which is pushed
5831 when the on_failure_jump_nastyloop failure-point is *popped*.
5832 This special frame thus marks the beginning of one iteration
5833 through the loop and we can hence easily check right here
5834 whether something matched between the beginning and the end of
5835 the loop. */
5836 case on_failure_jump_nastyloop:
5837 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5838 DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
5839 mcnt, p + mcnt);
5840
5841 assert ((re_opcode_t)p[-4] == no_op);
5842 {
5843 int cycle = 0;
5844 CHECK_INFINITE_LOOP (p - 4, d);
5845 if (!cycle)
5846 /* If there's a cycle, just continue without pushing
5847 this failure point. The failure point is the "try again"
5848 option, which shouldn't be tried.
5849 We want (x?)*?y\1z to match both xxyz and xxyxz. */
5850 PUSH_FAILURE_POINT (p - 3, d);
5851 }
5852 break;
5853
5854 /* Simple loop detecting on_failure_jump: just check on the
5855 failure stack if the same spot was already hit earlier. */
5856 case on_failure_jump_loop:
5857 on_failure:
5858 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5859 DEBUG_PRINT3 ("EXECUTING on_failure_jump_loop %d (to %p):\n",
5860 mcnt, p + mcnt);
5861 {
5862 int cycle = 0;
5863 CHECK_INFINITE_LOOP (p - 3, d);
5864 if (cycle)
5865 /* If there's a cycle, get out of the loop, as if the matching
5866 had failed. We used to just `goto fail' here, but that was
5867 aborting the search a bit too early: we want to keep the
5868 empty-loop-match and keep matching after the loop.
5869 We want (x?)*y\1z to match both xxyz and xxyxz. */
5870 p += mcnt;
5871 else
5872 PUSH_FAILURE_POINT (p - 3, d);
5873 }
5874 break;
5875
5876
5877 /* Uses of on_failure_jump:
5878
5879 Each alternative starts with an on_failure_jump that points
5880 to the beginning of the next alternative. Each alternative
5881 except the last ends with a jump that in effect jumps past
5882 the rest of the alternatives. (They really jump to the
5883 ending jump of the following alternative, because tensioning
5884 these jumps is a hassle.)
5885
5886 Repeats start with an on_failure_jump that points past both
5887 the repetition text and either the following jump or
5888 pop_failure_jump back to this on_failure_jump. */
5889 case on_failure_jump:
5890 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5891 DEBUG_PRINT3 ("EXECUTING on_failure_jump %d (to %p):\n",
5892 mcnt, p + mcnt);
5893
5894 PUSH_FAILURE_POINT (p -3, d);
5895 break;
5896
5897 /* This operation is used for greedy *.
5898 Compare the beginning of the repeat with what in the
5899 pattern follows its end. If we can establish that there
5900 is nothing that they would both match, i.e., that we
5901 would have to backtrack because of (as in, e.g., `a*a')
5902 then we can use a non-backtracking loop based on
5903 on_failure_keep_string_jump instead of on_failure_jump. */
5904 case on_failure_jump_smart:
5905 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5906 DEBUG_PRINT3 ("EXECUTING on_failure_jump_smart %d (to %p).\n",
5907 mcnt, p + mcnt);
5908 {
5909 re_char *p1 = p; /* Next operation. */
5910 /* Here, we discard `const', making re_match non-reentrant. */
5911 unsigned char *p2 = (unsigned char*) p + mcnt; /* Jump dest. */
5912 unsigned char *p3 = (unsigned char*) p - 3; /* opcode location. */
5913
5914 p -= 3; /* Reset so that we will re-execute the
5915 instruction once it's been changed. */
5916
5917 EXTRACT_NUMBER (mcnt, p2 - 2);
5918
5919 /* Ensure this is a indeed the trivial kind of loop
5920 we are expecting. */
5921 assert (skip_one_char (p1) == p2 - 3);
5922 assert ((re_opcode_t) p2[-3] == jump && p2 + mcnt == p);
5923 DEBUG_STATEMENT (debug += 2);
5924 if (mutually_exclusive_p (bufp, p1, p2))
5925 {
5926 /* Use a fast `on_failure_keep_string_jump' loop. */
5927 DEBUG_PRINT1 (" smart exclusive => fast loop.\n");
5928 *p3 = (unsigned char) on_failure_keep_string_jump;
5929 STORE_NUMBER (p2 - 2, mcnt + 3);
5930 }
5931 else
5932 {
5933 /* Default to a safe `on_failure_jump' loop. */
5934 DEBUG_PRINT1 (" smart default => slow loop.\n");
5935 *p3 = (unsigned char) on_failure_jump;
5936 }
5937 DEBUG_STATEMENT (debug -= 2);
5938 }
5939 break;
5940
5941 /* Unconditionally jump (without popping any failure points). */
5942 case jump:
5943 unconditional_jump:
5944 IMMEDIATE_QUIT_CHECK;
5945 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
5946 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
5947 p += mcnt; /* Do the jump. */
5948 DEBUG_PRINT2 ("(to %p).\n", p);
5949 break;
5950
5951
5952 /* Have to succeed matching what follows at least n times.
5953 After that, handle like `on_failure_jump'. */
5954 case succeed_n:
5955 /* Signedness doesn't matter since we only compare MCNT to 0. */
5956 EXTRACT_NUMBER (mcnt, p + 2);
5957 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
5958
5959 /* Originally, mcnt is how many times we HAVE to succeed. */
5960 if (mcnt != 0)
5961 {
5962 /* Here, we discard `const', making re_match non-reentrant. */
5963 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
5964 mcnt--;
5965 p += 4;
5966 PUSH_NUMBER (p2, mcnt);
5967 }
5968 else
5969 /* The two bytes encoding mcnt == 0 are two no_op opcodes. */
5970 goto on_failure;
5971 break;
5972
5973 case jump_n:
5974 /* Signedness doesn't matter since we only compare MCNT to 0. */
5975 EXTRACT_NUMBER (mcnt, p + 2);
5976 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
5977
5978 /* Originally, this is how many times we CAN jump. */
5979 if (mcnt != 0)
5980 {
5981 /* Here, we discard `const', making re_match non-reentrant. */
5982 unsigned char *p2 = (unsigned char*) p + 2; /* counter loc. */
5983 mcnt--;
5984 PUSH_NUMBER (p2, mcnt);
5985 goto unconditional_jump;
5986 }
5987 /* If don't have to jump any more, skip over the rest of command. */
5988 else
5989 p += 4;
5990 break;
5991
5992 case set_number_at:
5993 {
5994 unsigned char *p2; /* Location of the counter. */
5995 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
5996
5997 EXTRACT_NUMBER_AND_INCR (mcnt, p);
5998 /* Here, we discard `const', making re_match non-reentrant. */
5999 p2 = (unsigned char*) p + mcnt;
6000 /* Signedness doesn't matter since we only copy MCNT's bits . */
6001 EXTRACT_NUMBER_AND_INCR (mcnt, p);
6002 DEBUG_PRINT3 (" Setting %p to %d.\n", p2, mcnt);
6003 PUSH_NUMBER (p2, mcnt);
6004 break;
6005 }
6006
6007 case wordbound:
6008 case notwordbound:
6009 not = (re_opcode_t) *(p - 1) == notwordbound;
6010 DEBUG_PRINT2 ("EXECUTING %swordbound.\n", not?"not":"");
6011
6012 /* We SUCCEED (or FAIL) in one of the following cases: */
6013
6014 /* Case 1: D is at the beginning or the end of string. */
6015 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d))
6016 not = !not;
6017 else
6018 {
6019 /* C1 is the character before D, S1 is the syntax of C1, C2
6020 is the character at D, and S2 is the syntax of C2. */
6021 re_wchar_t c1, c2;
6022 int s1, s2;
6023 int dummy;
6024 #ifdef emacs
6025 int offset = PTR_TO_OFFSET (d - 1);
6026 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6027 UPDATE_SYNTAX_TABLE (charpos);
6028 #endif
6029 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6030 s1 = SYNTAX (c1);
6031 #ifdef emacs
6032 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
6033 #endif
6034 PREFETCH_NOLIMIT ();
6035 GET_CHAR_AFTER (c2, d, dummy);
6036 s2 = SYNTAX (c2);
6037
6038 if (/* Case 2: Only one of S1 and S2 is Sword. */
6039 ((s1 == Sword) != (s2 == Sword))
6040 /* Case 3: Both of S1 and S2 are Sword, and macro
6041 WORD_BOUNDARY_P (C1, C2) returns nonzero. */
6042 || ((s1 == Sword) && WORD_BOUNDARY_P (c1, c2)))
6043 not = !not;
6044 }
6045 if (not)
6046 break;
6047 else
6048 goto fail;
6049
6050 case wordbeg:
6051 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
6052
6053 /* We FAIL in one of the following cases: */
6054
6055 /* Case 1: D is at the end of string. */
6056 if (AT_STRINGS_END (d))
6057 goto fail;
6058 else
6059 {
6060 /* C1 is the character before D, S1 is the syntax of C1, C2
6061 is the character at D, and S2 is the syntax of C2. */
6062 re_wchar_t c1, c2;
6063 int s1, s2;
6064 int dummy;
6065 #ifdef emacs
6066 int offset = PTR_TO_OFFSET (d);
6067 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6068 UPDATE_SYNTAX_TABLE (charpos);
6069 #endif
6070 PREFETCH ();
6071 GET_CHAR_AFTER (c2, d, dummy);
6072 s2 = SYNTAX (c2);
6073
6074 /* Case 2: S2 is not Sword. */
6075 if (s2 != Sword)
6076 goto fail;
6077
6078 /* Case 3: D is not at the beginning of string ... */
6079 if (!AT_STRINGS_BEG (d))
6080 {
6081 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6082 #ifdef emacs
6083 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6084 #endif
6085 s1 = SYNTAX (c1);
6086
6087 /* ... and S1 is Sword, and WORD_BOUNDARY_P (C1, C2)
6088 returns 0. */
6089 if ((s1 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6090 goto fail;
6091 }
6092 }
6093 break;
6094
6095 case wordend:
6096 DEBUG_PRINT1 ("EXECUTING wordend.\n");
6097
6098 /* We FAIL in one of the following cases: */
6099
6100 /* Case 1: D is at the beginning of string. */
6101 if (AT_STRINGS_BEG (d))
6102 goto fail;
6103 else
6104 {
6105 /* C1 is the character before D, S1 is the syntax of C1, C2
6106 is the character at D, and S2 is the syntax of C2. */
6107 re_wchar_t c1, c2;
6108 int s1, s2;
6109 int dummy;
6110 #ifdef emacs
6111 int offset = PTR_TO_OFFSET (d) - 1;
6112 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6113 UPDATE_SYNTAX_TABLE (charpos);
6114 #endif
6115 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6116 s1 = SYNTAX (c1);
6117
6118 /* Case 2: S1 is not Sword. */
6119 if (s1 != Sword)
6120 goto fail;
6121
6122 /* Case 3: D is not at the end of string ... */
6123 if (!AT_STRINGS_END (d))
6124 {
6125 PREFETCH_NOLIMIT ();
6126 GET_CHAR_AFTER (c2, d, dummy);
6127 #ifdef emacs
6128 UPDATE_SYNTAX_TABLE_FORWARD (charpos);
6129 #endif
6130 s2 = SYNTAX (c2);
6131
6132 /* ... and S2 is Sword, and WORD_BOUNDARY_P (C1, C2)
6133 returns 0. */
6134 if ((s2 == Sword) && !WORD_BOUNDARY_P (c1, c2))
6135 goto fail;
6136 }
6137 }
6138 break;
6139
6140 case symbeg:
6141 DEBUG_PRINT1 ("EXECUTING symbeg.\n");
6142
6143 /* We FAIL in one of the following cases: */
6144
6145 /* Case 1: D is at the end of string. */
6146 if (AT_STRINGS_END (d))
6147 goto fail;
6148 else
6149 {
6150 /* C1 is the character before D, S1 is the syntax of C1, C2
6151 is the character at D, and S2 is the syntax of C2. */
6152 re_wchar_t c1, c2;
6153 int s1, s2;
6154 #ifdef emacs
6155 int offset = PTR_TO_OFFSET (d);
6156 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6157 UPDATE_SYNTAX_TABLE (charpos);
6158 #endif
6159 PREFETCH ();
6160 c2 = RE_STRING_CHAR (d, target_multibyte);
6161 s2 = SYNTAX (c2);
6162
6163 /* Case 2: S2 is neither Sword nor Ssymbol. */
6164 if (s2 != Sword && s2 != Ssymbol)
6165 goto fail;
6166
6167 /* Case 3: D is not at the beginning of string ... */
6168 if (!AT_STRINGS_BEG (d))
6169 {
6170 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6171 #ifdef emacs
6172 UPDATE_SYNTAX_TABLE_BACKWARD (charpos - 1);
6173 #endif
6174 s1 = SYNTAX (c1);
6175
6176 /* ... and S1 is Sword or Ssymbol. */
6177 if (s1 == Sword || s1 == Ssymbol)
6178 goto fail;
6179 }
6180 }
6181 break;
6182
6183 case symend:
6184 DEBUG_PRINT1 ("EXECUTING symend.\n");
6185
6186 /* We FAIL in one of the following cases: */
6187
6188 /* Case 1: D is at the beginning of string. */
6189 if (AT_STRINGS_BEG (d))
6190 goto fail;
6191 else
6192 {
6193 /* C1 is the character before D, S1 is the syntax of C1, C2
6194 is the character at D, and S2 is the syntax of C2. */
6195 re_wchar_t c1, c2;
6196 int s1, s2;
6197 #ifdef emacs
6198 int offset = PTR_TO_OFFSET (d) - 1;
6199 int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6200 UPDATE_SYNTAX_TABLE (charpos);
6201 #endif
6202 GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2);
6203 s1 = SYNTAX (c1);
6204
6205 /* Case 2: S1 is neither Ssymbol nor Sword. */
6206 if (s1 != Sword && s1 != Ssymbol)
6207 goto fail;
6208
6209 /* Case 3: D is not at the end of string ... */
6210 if (!AT_STRINGS_END (d))
6211 {
6212 PREFETCH_NOLIMIT ();
6213 c2 = RE_STRING_CHAR (d, target_multibyte);
6214 #ifdef emacs
6215 UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1);
6216 #endif
6217 s2 = SYNTAX (c2);
6218
6219 /* ... and S2 is Sword or Ssymbol. */
6220 if (s2 == Sword || s2 == Ssymbol)
6221 goto fail;
6222 }
6223 }
6224 break;
6225
6226 case syntaxspec:
6227 case notsyntaxspec:
6228 not = (re_opcode_t) *(p - 1) == notsyntaxspec;
6229 mcnt = *p++;
6230 DEBUG_PRINT3 ("EXECUTING %ssyntaxspec %d.\n", not?"not":"", mcnt);
6231 PREFETCH ();
6232 #ifdef emacs
6233 {
6234 int offset = PTR_TO_OFFSET (d);
6235 int pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset);
6236 UPDATE_SYNTAX_TABLE (pos1);
6237 }
6238 #endif
6239 {
6240 int len;
6241 re_wchar_t c;
6242
6243 GET_CHAR_AFTER (c, d, len);
6244 if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not)
6245 goto fail;
6246 d += len;
6247 }
6248 break;
6249
6250 #ifdef emacs
6251 case before_dot:
6252 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
6253 if (PTR_BYTE_POS (d) >= PT_BYTE)
6254 goto fail;
6255 break;
6256
6257 case at_dot:
6258 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
6259 if (PTR_BYTE_POS (d) != PT_BYTE)
6260 goto fail;
6261 break;
6262
6263 case after_dot:
6264 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
6265 if (PTR_BYTE_POS (d) <= PT_BYTE)
6266 goto fail;
6267 break;
6268
6269 case categoryspec:
6270 case notcategoryspec:
6271 not = (re_opcode_t) *(p - 1) == notcategoryspec;
6272 mcnt = *p++;
6273 DEBUG_PRINT3 ("EXECUTING %scategoryspec %d.\n", not?"not":"", mcnt);
6274 PREFETCH ();
6275 {
6276 int len;
6277 re_wchar_t c;
6278
6279 GET_CHAR_AFTER (c, d, len);
6280 if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not)
6281 goto fail;
6282 d += len;
6283 }
6284 break;
6285
6286 #endif /* emacs */
6287
6288 default:
6289 abort ();
6290 }
6291 continue; /* Successfully executed one pattern command; keep going. */
6292
6293
6294 /* We goto here if a matching operation fails. */
6295 fail:
6296 IMMEDIATE_QUIT_CHECK;
6297 if (!FAIL_STACK_EMPTY ())
6298 {
6299 re_char *str, *pat;
6300 /* A restart point is known. Restore to that state. */
6301 DEBUG_PRINT1 ("\nFAIL:\n");
6302 POP_FAILURE_POINT (str, pat);
6303 switch (SWITCH_ENUM_CAST ((re_opcode_t) *pat++))
6304 {
6305 case on_failure_keep_string_jump:
6306 assert (str == NULL);
6307 goto continue_failure_jump;
6308
6309 case on_failure_jump_nastyloop:
6310 assert ((re_opcode_t)pat[-2] == no_op);
6311 PUSH_FAILURE_POINT (pat - 2, str);
6312 /* Fallthrough */
6313
6314 case on_failure_jump_loop:
6315 case on_failure_jump:
6316 case succeed_n:
6317 d = str;
6318 continue_failure_jump:
6319 EXTRACT_NUMBER_AND_INCR (mcnt, pat);
6320 p = pat + mcnt;
6321 break;
6322
6323 case no_op:
6324 /* A special frame used for nastyloops. */
6325 goto fail;
6326
6327 default:
6328 abort();
6329 }
6330
6331 assert (p >= bufp->buffer && p <= pend);
6332
6333 if (d >= string1 && d <= end1)
6334 dend = end_match_1;
6335 }
6336 else
6337 break; /* Matching at this starting point really fails. */
6338 } /* for (;;) */
6339
6340 if (best_regs_set)
6341 goto restore_best_regs;
6342
6343 FREE_VARIABLES ();
6344
6345 return -1; /* Failure to match. */
6346 } /* re_match_2 */
6347 \f
6348 /* Subroutine definitions for re_match_2. */
6349
6350 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6351 bytes; nonzero otherwise. */
6352
6353 static int
6354 bcmp_translate (const re_char *s1, const re_char *s2, register int len,
6355 RE_TRANSLATE_TYPE translate, const int target_multibyte)
6356 {
6357 register re_char *p1 = s1, *p2 = s2;
6358 re_char *p1_end = s1 + len;
6359 re_char *p2_end = s2 + len;
6360
6361 /* FIXME: Checking both p1 and p2 presumes that the two strings might have
6362 different lengths, but relying on a single `len' would break this. -sm */
6363 while (p1 < p1_end && p2 < p2_end)
6364 {
6365 int p1_charlen, p2_charlen;
6366 re_wchar_t p1_ch, p2_ch;
6367
6368 GET_CHAR_AFTER (p1_ch, p1, p1_charlen);
6369 GET_CHAR_AFTER (p2_ch, p2, p2_charlen);
6370
6371 if (RE_TRANSLATE (translate, p1_ch)
6372 != RE_TRANSLATE (translate, p2_ch))
6373 return 1;
6374
6375 p1 += p1_charlen, p2 += p2_charlen;
6376 }
6377
6378 if (p1 != p1_end || p2 != p2_end)
6379 return 1;
6380
6381 return 0;
6382 }
6383 \f
6384 /* Entry points for GNU code. */
6385
6386 /* re_compile_pattern is the GNU regular expression compiler: it
6387 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6388 Returns 0 if the pattern was valid, otherwise an error string.
6389
6390 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6391 are set in BUFP on entry.
6392
6393 We call regex_compile to do the actual compilation. */
6394
6395 const char *
6396 re_compile_pattern (const char *pattern, size_t length, struct re_pattern_buffer *bufp)
6397 {
6398 reg_errcode_t ret;
6399
6400 /* GNU code is written to assume at least RE_NREGS registers will be set
6401 (and at least one extra will be -1). */
6402 bufp->regs_allocated = REGS_UNALLOCATED;
6403
6404 /* And GNU code determines whether or not to get register information
6405 by passing null for the REGS argument to re_match, etc., not by
6406 setting no_sub. */
6407 bufp->no_sub = 0;
6408
6409 ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);
6410
6411 if (!ret)
6412 return NULL;
6413 return gettext (re_error_msgid[(int) ret]);
6414 }
6415 WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)
6416 \f
6417 /* Entry points compatible with 4.2 BSD regex library. We don't define
6418 them unless specifically requested. */
6419
6420 #if defined _REGEX_RE_COMP || defined _LIBC
6421
6422 /* BSD has one and only one pattern buffer. */
6423 static struct re_pattern_buffer re_comp_buf;
6424
6425 char *
6426 # ifdef _LIBC
6427 /* Make these definitions weak in libc, so POSIX programs can redefine
6428 these names if they don't use our functions, and still use
6429 regcomp/regexec below without link errors. */
6430 weak_function
6431 # endif
6432 re_comp (s)
6433 const char *s;
6434 {
6435 reg_errcode_t ret;
6436
6437 if (!s)
6438 {
6439 if (!re_comp_buf.buffer)
6440 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6441 return (char *) gettext ("No previous regular expression");
6442 return 0;
6443 }
6444
6445 if (!re_comp_buf.buffer)
6446 {
6447 re_comp_buf.buffer = (unsigned char *) malloc (200);
6448 if (re_comp_buf.buffer == NULL)
6449 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6450 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6451 re_comp_buf.allocated = 200;
6452
6453 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
6454 if (re_comp_buf.fastmap == NULL)
6455 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6456 return (char *) gettext (re_error_msgid[(int) REG_ESPACE]);
6457 }
6458
6459 /* Since `re_exec' always passes NULL for the `regs' argument, we
6460 don't need to initialize the pattern buffer fields which affect it. */
6461
6462 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
6463
6464 if (!ret)
6465 return NULL;
6466
6467 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6468 return (char *) gettext (re_error_msgid[(int) ret]);
6469 }
6470
6471
6472 int
6473 # ifdef _LIBC
6474 weak_function
6475 # endif
6476 re_exec (s)
6477 const char *s;
6478 {
6479 const int len = strlen (s);
6480 return
6481 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
6482 }
6483 #endif /* _REGEX_RE_COMP */
6484 \f
6485 /* POSIX.2 functions. Don't define these for Emacs. */
6486
6487 #ifndef emacs
6488
6489 /* regcomp takes a regular expression as a string and compiles it.
6490
6491 PREG is a regex_t *. We do not expect any fields to be initialized,
6492 since POSIX says we shouldn't. Thus, we set
6493
6494 `buffer' to the compiled pattern;
6495 `used' to the length of the compiled pattern;
6496 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6497 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6498 RE_SYNTAX_POSIX_BASIC;
6499 `fastmap' to an allocated space for the fastmap;
6500 `fastmap_accurate' to zero;
6501 `re_nsub' to the number of subexpressions in PATTERN.
6502
6503 PATTERN is the address of the pattern string.
6504
6505 CFLAGS is a series of bits which affect compilation.
6506
6507 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6508 use POSIX basic syntax.
6509
6510 If REG_NEWLINE is set, then . and [^...] don't match newline.
6511 Also, regexec will try a match beginning after every newline.
6512
6513 If REG_ICASE is set, then we considers upper- and lowercase
6514 versions of letters to be equivalent when matching.
6515
6516 If REG_NOSUB is set, then when PREG is passed to regexec, that
6517 routine will report only success or failure, and nothing about the
6518 registers.
6519
6520 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6521 the return codes and their meanings.) */
6522
6523 int
6524 regcomp (regex_t *__restrict preg, const char *__restrict pattern,
6525 int cflags)
6526 {
6527 reg_errcode_t ret;
6528 reg_syntax_t syntax
6529 = (cflags & REG_EXTENDED) ?
6530 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6531
6532 /* regex_compile will allocate the space for the compiled pattern. */
6533 preg->buffer = 0;
6534 preg->allocated = 0;
6535 preg->used = 0;
6536
6537 /* Try to allocate space for the fastmap. */
6538 preg->fastmap = (char *) malloc (1 << BYTEWIDTH);
6539
6540 if (cflags & REG_ICASE)
6541 {
6542 unsigned i;
6543
6544 preg->translate
6545 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE
6546 * sizeof (*(RE_TRANSLATE_TYPE)0));
6547 if (preg->translate == NULL)
6548 return (int) REG_ESPACE;
6549
6550 /* Map uppercase characters to corresponding lowercase ones. */
6551 for (i = 0; i < CHAR_SET_SIZE; i++)
6552 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i;
6553 }
6554 else
6555 preg->translate = NULL;
6556
6557 /* If REG_NEWLINE is set, newlines are treated differently. */
6558 if (cflags & REG_NEWLINE)
6559 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
6560 syntax &= ~RE_DOT_NEWLINE;
6561 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6562 }
6563 else
6564 syntax |= RE_NO_NEWLINE_ANCHOR;
6565
6566 preg->no_sub = !!(cflags & REG_NOSUB);
6567
6568 /* POSIX says a null character in the pattern terminates it, so we
6569 can use strlen here in compiling the pattern. */
6570 ret = regex_compile ((re_char*) pattern, strlen (pattern), syntax, preg);
6571
6572 /* POSIX doesn't distinguish between an unmatched open-group and an
6573 unmatched close-group: both are REG_EPAREN. */
6574 if (ret == REG_ERPAREN)
6575 ret = REG_EPAREN;
6576
6577 if (ret == REG_NOERROR && preg->fastmap)
6578 { /* Compute the fastmap now, since regexec cannot modify the pattern
6579 buffer. */
6580 re_compile_fastmap (preg);
6581 if (preg->can_be_null)
6582 { /* The fastmap can't be used anyway. */
6583 free (preg->fastmap);
6584 preg->fastmap = NULL;
6585 }
6586 }
6587 return (int) ret;
6588 }
6589 WEAK_ALIAS (__regcomp, regcomp)
6590
6591
6592 /* regexec searches for a given pattern, specified by PREG, in the
6593 string STRING.
6594
6595 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6596 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6597 least NMATCH elements, and we set them to the offsets of the
6598 corresponding matched substrings.
6599
6600 EFLAGS specifies `execution flags' which affect matching: if
6601 REG_NOTBOL is set, then ^ does not match at the beginning of the
6602 string; if REG_NOTEOL is set, then $ does not match at the end.
6603
6604 We return 0 if we find a match and REG_NOMATCH if not. */
6605
6606 int
6607 regexec (const regex_t *__restrict preg, const char *__restrict string,
6608 size_t nmatch, regmatch_t pmatch[__restrict_arr], int eflags)
6609 {
6610 int ret;
6611 struct re_registers regs;
6612 regex_t private_preg;
6613 int len = strlen (string);
6614 boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;
6615
6616 private_preg = *preg;
6617
6618 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6619 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6620
6621 /* The user has told us exactly how many registers to return
6622 information about, via `nmatch'. We have to pass that on to the
6623 matching routines. */
6624 private_preg.regs_allocated = REGS_FIXED;
6625
6626 if (want_reg_info)
6627 {
6628 regs.num_regs = nmatch;
6629 regs.start = TALLOC (nmatch * 2, regoff_t);
6630 if (regs.start == NULL)
6631 return (int) REG_NOMATCH;
6632 regs.end = regs.start + nmatch;
6633 }
6634
6635 /* Instead of using not_eol to implement REG_NOTEOL, we could simply
6636 pass (&private_preg, string, len + 1, 0, len, ...) pretending the string
6637 was a little bit longer but still only matching the real part.
6638 This works because the `endline' will check for a '\n' and will find a
6639 '\0', correctly deciding that this is not the end of a line.
6640 But it doesn't work out so nicely for REG_NOTBOL, since we don't have
6641 a convenient '\0' there. For all we know, the string could be preceded
6642 by '\n' which would throw things off. */
6643
6644 /* Perform the searching operation. */
6645 ret = re_search (&private_preg, string, len,
6646 /* start: */ 0, /* range: */ len,
6647 want_reg_info ? &regs : (struct re_registers *) 0);
6648
6649 /* Copy the register information to the POSIX structure. */
6650 if (want_reg_info)
6651 {
6652 if (ret >= 0)
6653 {
6654 unsigned r;
6655
6656 for (r = 0; r < nmatch; r++)
6657 {
6658 pmatch[r].rm_so = regs.start[r];
6659 pmatch[r].rm_eo = regs.end[r];
6660 }
6661 }
6662
6663 /* If we needed the temporary register info, free the space now. */
6664 free (regs.start);
6665 }
6666
6667 /* We want zero return to mean success, unlike `re_search'. */
6668 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
6669 }
6670 WEAK_ALIAS (__regexec, regexec)
6671
6672
6673 /* Returns a message corresponding to an error code, ERR_CODE, returned
6674 from either regcomp or regexec. We don't use PREG here.
6675
6676 ERR_CODE was previously called ERRCODE, but that name causes an
6677 error with msvc8 compiler. */
6678
6679 size_t
6680 regerror (int err_code, const regex_t *preg, char *errbuf, size_t errbuf_size)
6681 {
6682 const char *msg;
6683 size_t msg_size;
6684
6685 if (err_code < 0
6686 || err_code >= (sizeof (re_error_msgid) / sizeof (re_error_msgid[0])))
6687 /* Only error codes returned by the rest of the code should be passed
6688 to this routine. If we are given anything else, or if other regex
6689 code generates an invalid error code, then the program has a bug.
6690 Dump core so we can fix it. */
6691 abort ();
6692
6693 msg = gettext (re_error_msgid[err_code]);
6694
6695 msg_size = strlen (msg) + 1; /* Includes the null. */
6696
6697 if (errbuf_size != 0)
6698 {
6699 if (msg_size > errbuf_size)
6700 {
6701 strncpy (errbuf, msg, errbuf_size - 1);
6702 errbuf[errbuf_size - 1] = 0;
6703 }
6704 else
6705 strcpy (errbuf, msg);
6706 }
6707
6708 return msg_size;
6709 }
6710 WEAK_ALIAS (__regerror, regerror)
6711
6712
6713 /* Free dynamically allocated space used by PREG. */
6714
6715 void
6716 regfree (regex_t *preg)
6717 {
6718 free (preg->buffer);
6719 preg->buffer = NULL;
6720
6721 preg->allocated = 0;
6722 preg->used = 0;
6723
6724 free (preg->fastmap);
6725 preg->fastmap = NULL;
6726 preg->fastmap_accurate = 0;
6727
6728 free (preg->translate);
6729 preg->translate = NULL;
6730 }
6731 WEAK_ALIAS (__regfree, regfree)
6732
6733 #endif /* not emacs */