X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/4fc35edd5fcdfe258c04cfed707753fdd8795a72..63750fd4ed4ff8bb9b3ff8868d4e36e3422adb21:/src/regex.c diff --git a/src/regex.c b/src/regex.c index 41fe3fa808..f92bcb7923 100644 --- a/src/regex.c +++ b/src/regex.c @@ -2,7 +2,7 @@ 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the internationalization features.) - Copyright (C) 1993-2015 Free Software Foundation, Inc. + Copyright (C) 1993-2016 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -215,7 +215,7 @@ xmalloc (size_t size) void *val = malloc (size); if (!val && size) { - write (2, "virtual memory exhausted\n", 25); + write (STDERR_FILENO, "virtual memory exhausted\n", 25); exit (1); } return val; @@ -233,7 +233,7 @@ xrealloc (void *block, size_t size) val = realloc (block, size); if (!val && size) { - write (2, "virtual memory exhausted\n", 25); + write (STDERR_FILENO, "virtual memory exhausted\n", 25); exit (1); } return val; @@ -313,23 +313,23 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; /* The rest must handle multibyte characters. */ # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ - ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ - : 1) + ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \ + : graphicp (c)) # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ ? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \ - : 1) + : printablep (c)) # define ISALNUM(c) (IS_REAL_ASCII (c) \ ? (((c) >= 'a' && (c) <= 'z') \ || ((c) >= 'A' && (c) <= 'Z') \ || ((c) >= '0' && (c) <= '9')) \ - : SYNTAX (c) == Sword) + : (alphabeticp (c) || decimalnump (c))) # define ISALPHA(c) (IS_REAL_ASCII (c) \ ? (((c) >= 'a' && (c) <= 'z') \ || ((c) >= 'A' && (c) <= 'Z')) \ - : SYNTAX (c) == Sword) + : alphabeticp (c)) # define ISLOWER(c) lowercasep (c) @@ -1197,13 +1197,6 @@ print_double_string (re_char *where, re_char *string1, ssize_t size1, #endif /* not DEBUG */ -/* Use this to suppress gcc's `...may be used before initialized' warnings. */ -#ifdef lint -# define IF_LINT(Code) Code -#else -# define IF_LINT(Code) /* empty */ -#endif - /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ @@ -1544,9 +1537,9 @@ do { \ DEBUG_PRINT (" Push frame index: %zd\n", fail_stack.frame); \ PUSH_FAILURE_INT (fail_stack.frame); \ \ - DEBUG_PRINT (" Push string %p: `", string_place); \ + DEBUG_PRINT (" Push string %p: \"", string_place); \ DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, size2);\ - DEBUG_PRINT ("'\n"); \ + DEBUG_PRINT ("\"\n"); \ PUSH_FAILURE_POINTER (string_place); \ \ DEBUG_PRINT (" Push pattern %p: ", pattern); \ @@ -1598,9 +1591,9 @@ do { \ on_failure_keep_string_jump opcode, and we want to throw away the \ saved NULL, thus retaining our current position in the string. */ \ str = POP_FAILURE_POINTER (); \ - DEBUG_PRINT (" Popping string %p: `", str); \ + DEBUG_PRINT (" Popping string %p: \"", str); \ DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ - DEBUG_PRINT ("'\n"); \ + DEBUG_PRINT ("\"\n"); \ \ fail_stack.frame = POP_FAILURE_INT (); \ DEBUG_PRINT (" Popping frame index: %zd\n", fail_stack.frame); \ @@ -1865,13 +1858,18 @@ struct range_table_work_area #define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i]) /* Bits used to implement the multibyte-part of the various character classes - such as [:alnum:] in a charset's range table. */ + such as [:alnum:] in a charset's range table. The code currently assumes + that only the low 16 bits are used. */ #define BIT_WORD 0x1 #define BIT_LOWER 0x2 #define BIT_PUNCT 0x4 #define BIT_SPACE 0x8 #define BIT_UPPER 0x10 #define BIT_MULTIBYTE 0x20 +#define BIT_ALPHA 0x40 +#define BIT_ALNUM 0x80 +#define BIT_GRAPH 0x100 +#define BIT_PRINT 0x200 /* Set the bit for character C in a list. */ @@ -2070,13 +2068,17 @@ re_wctype_to_bit (re_wctype_t cc) { switch (cc) { - case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH: + case RECC_NONASCII: case RECC_MULTIBYTE: return BIT_MULTIBYTE; - case RECC_ALPHA: case RECC_ALNUM: case RECC_WORD: return BIT_WORD; + case RECC_ALPHA: return BIT_ALPHA; + case RECC_ALNUM: return BIT_ALNUM; + case RECC_WORD: return BIT_WORD; case RECC_LOWER: return BIT_LOWER; case RECC_UPPER: return BIT_UPPER; case RECC_PUNCT: return BIT_PUNCT; case RECC_SPACE: return BIT_SPACE; + case RECC_GRAPH: return BIT_GRAPH; + case RECC_PRINT: return BIT_PRINT; case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL: case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0; default: @@ -2463,9 +2465,9 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, /* These hold the values of p, pattern, and pend from the main pattern when we have pushed into a subpattern. */ - re_char *main_p IF_LINT (= NULL); - re_char *main_pattern IF_LINT (= NULL); - re_char *main_pend IF_LINT (= NULL); + re_char *main_p; + re_char *main_pattern; + re_char *main_pend; #ifdef DEBUG debug++; @@ -2930,7 +2932,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, #endif /* emacs */ /* In most cases the matching rule for char classes only uses the syntax table for multibyte chars, - so that the content of the syntax-table it is not + so that the content of the syntax-table is not hardcoded in the range_table. SPACE and WORD are the two exceptions. */ if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) @@ -2945,7 +2947,7 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, p = class_beg; SET_LIST_BIT ('['); - /* Because the `:' may starts the range, we + /* Because the `:' may start the range, we can't simply set bit and repeat the loop. Instead, just set it to C and handle below. */ c = ':'; @@ -5118,9 +5120,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, DEBUG_PRINT ("The compiled pattern is: "); DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); - DEBUG_PRINT ("The string to match is: `"); + DEBUG_PRINT ("The string to match is: \""); DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); - DEBUG_PRINT ("'\n"); + DEBUG_PRINT ("\"\n"); /* This loops over pattern commands. It exits by returning from the function if the match is complete, or it drops through if the match @@ -5131,8 +5133,6 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, if (p == pend) { - ptrdiff_t dcnt; - /* End of pattern means we might have succeeded. */ DEBUG_PRINT ("end of pattern ... "); @@ -5140,19 +5140,22 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, longest match, try backtracking. */ if (d != end_match_2) { - /* 1 if this match ends in the same string (string1 or string2) - as the best previous match. */ - boolean same_str_p = (FIRST_STRING_P (match_end) - == FIRST_STRING_P (d)); - /* 1 if this match is the best seen so far. */ - boolean best_match_p; - - /* AIX compiler got confused when this was combined - with the previous declaration. */ - if (same_str_p) - best_match_p = d > match_end; - else - best_match_p = !FIRST_STRING_P (d); + /* True if this match is the best seen so far. */ + bool best_match_p; + + { + /* True if this match ends in the same string (string1 + or string2) as the best previous match. */ + bool same_str_p = (FIRST_STRING_P (match_end) + == FIRST_STRING_P (d)); + + /* AIX compiler got confused when this was combined + with the previous declaration. */ + if (same_str_p) + best_match_p = d > match_end; + else + best_match_p = !FIRST_STRING_P (d); + } DEBUG_PRINT ("backtracking.\n"); @@ -5281,7 +5284,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, nfailure_points_pushed - nfailure_points_popped); DEBUG_PRINT ("%u registers pushed.\n", num_regs_pushed); - dcnt = POINTER_TO_OFFSET (d) - pos; + ptrdiff_t dcnt = POINTER_TO_OFFSET (d) - pos; DEBUG_PRINT ("Returning %td from re_match_2.\n", dcnt); @@ -5426,7 +5429,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, && buf_ch == '\000')) goto fail; - DEBUG_PRINT (" Matched `%d'.\n", *d); + DEBUG_PRINT (" Matched \"%d\".\n", *d); d += buf_charlen; } break; @@ -5435,13 +5438,13 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, case charset: case charset_not: { - register unsigned int c; + register unsigned int c, corig; boolean not = (re_opcode_t) *(p - 1) == charset_not; int len; /* Start of actual range_table, or end of bitmap if there is no range table. */ - re_char *range_table IF_LINT (= NULL); + re_char *range_table UNINIT; /* Nonzero if there is a range table. */ int range_table_exists; @@ -5464,7 +5467,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, } PREFETCH (); - c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte); + corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte); if (target_multibyte) { int c1; @@ -5508,12 +5511,22 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, { int class_bits = CHARSET_RANGE_TABLE_BITS (&p[-1]); - if ( (class_bits & BIT_LOWER && ISLOWER (c)) + if ( (class_bits & BIT_LOWER + && (ISLOWER (c) + || (corig != c + && c == upcase (corig) && ISUPPER(c)))) | (class_bits & BIT_MULTIBYTE) | (class_bits & BIT_PUNCT && ISPUNCT (c)) | (class_bits & BIT_SPACE && ISSPACE (c)) - | (class_bits & BIT_UPPER && ISUPPER (c)) - | (class_bits & BIT_WORD && ISWORD (c))) + | (class_bits & BIT_UPPER + && (ISUPPER (c) + || (corig != c + && c == downcase (corig) && ISLOWER (c)))) + | (class_bits & BIT_WORD && ISWORD (c)) + | (class_bits & BIT_ALPHA && ISALPHA (c)) + | (class_bits & BIT_ALNUM && ISALNUM (c)) + | (class_bits & BIT_GRAPH && ISGRAPH (c)) + | (class_bits & BIT_PRINT && ISPRINT (c))) not = !not; else CHARSET_LOOKUP_RANGE_TABLE_RAW (not, c, range_table, count); @@ -5932,12 +5945,12 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d - 1); ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (charpos); + UPDATE_SYNTAX_TABLE_FAST (charpos); #endif GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); s1 = SYNTAX (c1); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); + UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos + 1); #endif PREFETCH_NOLIMIT (); GET_CHAR_AFTER (c2, d, dummy); @@ -5974,7 +5987,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d); ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (charpos); + UPDATE_SYNTAX_TABLE_FAST (charpos); #endif PREFETCH (); GET_CHAR_AFTER (c2, d, dummy); @@ -6019,7 +6032,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d) - 1; ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (charpos); + UPDATE_SYNTAX_TABLE_FAST (charpos); #endif GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); s1 = SYNTAX (c1); @@ -6034,7 +6047,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, PREFETCH_NOLIMIT (); GET_CHAR_AFTER (c2, d, dummy); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD (charpos); + UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos); #endif s2 = SYNTAX (c2); @@ -6063,7 +6076,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d); ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (charpos); + UPDATE_SYNTAX_TABLE_FAST (charpos); #endif PREFETCH (); c2 = RE_STRING_CHAR (d, target_multibyte); @@ -6106,7 +6119,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, #ifdef emacs ssize_t offset = PTR_TO_OFFSET (d) - 1; ssize_t charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (charpos); + UPDATE_SYNTAX_TABLE_FAST (charpos); #endif GET_CHAR_BEFORE_2 (c1, d, string1, end1, string2, end2); s1 = SYNTAX (c1); @@ -6121,7 +6134,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, PREFETCH_NOLIMIT (); c2 = RE_STRING_CHAR (d, target_multibyte); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); + UPDATE_SYNTAX_TABLE_FORWARD_FAST (charpos + 1); #endif s2 = SYNTAX (c2); @@ -6144,7 +6157,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp, const_re_char *string1, { ssize_t offset = PTR_TO_OFFSET (d); ssize_t pos1 = SYNTAX_TABLE_BYTE_TO_CHAR (offset); - UPDATE_SYNTAX_TABLE (pos1); + UPDATE_SYNTAX_TABLE_FAST (pos1); } #endif {