X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/98284ef51c67fa69796946466337d426ab81f9ee..6b985764f07ae164d8142ba64774f2beb2856ca8:/src/bidi.c diff --git a/src/bidi.c b/src/bidi.c index e5e08c6a25..6ea84d9c0c 100644 --- a/src/bidi.c +++ b/src/bidi.c @@ -1,13 +1,13 @@ /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs. - Copyright (C) 2000-2001, 2004-2005, 2009-2015 Free Software + Copyright (C) 2000-2001, 2004-2005, 2009-2016 Free Software Foundation, Inc. This file is part of GNU Emacs. GNU Emacs is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +the Free Software Foundation, either version 3 of the License, or (at +your option) any later version. GNU Emacs is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -532,7 +532,7 @@ bidi_copy_it (struct bidi_it *to, struct bidi_it *from) /* Copy everything from the start through the active part of the level stack. */ memcpy (to, from, - (offsetof (struct bidi_it, level_stack[1]) + (offsetof (struct bidi_it, level_stack) + sizeof from->level_stack[0] + from->stack_idx * sizeof from->level_stack[0])); } @@ -1107,15 +1107,9 @@ bidi_initialize (void) emacs_abort (); staticpro (&bidi_brackets_table); - DEFSYM (Qparagraph_start, "paragraph-start"); - paragraph_start_re = Fsymbol_value (Qparagraph_start); - if (!STRINGP (paragraph_start_re)) - paragraph_start_re = build_string ("\f\\|[ \t]*$"); + paragraph_start_re = build_string ("^\\(\f\\|[ \t]*\\)$"); staticpro (¶graph_start_re); - DEFSYM (Qparagraph_separate, "paragraph-separate"); - paragraph_separate_re = Fsymbol_value (Qparagraph_separate); - if (!STRINGP (paragraph_separate_re)) - paragraph_separate_re = build_string ("[ \t\f]*$"); + paragraph_separate_re = build_string ("^[ \t\f]*$"); staticpro (¶graph_separate_re); bidi_cache_sp = 0; @@ -1313,13 +1307,13 @@ bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos, /* `(space ...)' display specs are handled as paragraph separators for the purposes of the reordering; see UAX#9 section 3 and clause HL1 in section 4.3 there. */ - ch = 0x2029; + ch = PARAGRAPH_SEPARATOR; } else { /* All other display specs are handled as the Unicode Object Replacement Character. */ - ch = 0xFFFC; + ch = OBJECT_REPLACEMENT_CHARACTER; } disp_end_pos = compute_display_string_end (*disp_pos, string); if (disp_end_pos < 0) @@ -1919,8 +1913,6 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) { eassert (bidi_it->prev.charpos == bidi_it->charpos - 1); prev_type = bidi_it->prev.orig_type; - if (prev_type == FSI) - prev_type = bidi_it->type_after_wn; } } /* Don't move at end of buffer/string. */ @@ -1935,8 +1927,6 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) emacs_abort (); bidi_it->bytepos += bidi_it->ch_len; prev_type = bidi_it->orig_type; - if (prev_type == FSI) - prev_type = bidi_it->type_after_wn; } else /* EOB or end of string */ prev_type = NEUTRAL_B; @@ -2091,10 +2081,17 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) if (typ1 != STRONG_R && typ1 != STRONG_AL) { type = LRI; + /* Override orig_type, which will be needed when we come to + examine the next character, which is the first character + inside the isolate. */ + bidi_it->orig_type = type; goto fsi_as_lri; } else - type = RLI; + { + type = RLI; + bidi_it->orig_type = type; + } /* FALLTHROUGH */ case RLI: /* X5a */ if (override == NEUTRAL_DIR) @@ -2315,7 +2312,31 @@ bidi_resolve_weak (struct bidi_it *bidi_it) if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */ type = WEAK_EN; } - else if (bidi_it->next_en_pos >=0) + else if (type == WEAK_BN + /* This condition is for the following important case: + + . we are at level zero + . either previous strong character was L, + or we've seen no strong characters since sos + and the base paragraph direction is L2R + . this BN is NOT a bidi directional control + + For such a situation, either this BN will be + converted to EN per W5, and then to L by virtue + of W7; or it will become ON per W6, and then L + because of N1/N2. So we take a shortcut here + and make it L right away, to avoid the + potentially costly loop below. This is + important when the buffer has a long series of + control characters, like binary nulls, and no + R2L characters at all. */ + && new_level == 0 + && !bidi_explicit_dir_char (bidi_it->ch) + && ((bidi_it->last_strong.type == STRONG_L) + || (bidi_it->last_strong.type == UNKNOWN_BT + && bidi_it->sos == L2R))) + type = STRONG_L; + else if (bidi_it->next_en_pos >= 0) { /* We overstepped the last known position for ET resolution but there could be other such characters @@ -2452,7 +2473,7 @@ typedef struct bpa_stack_entry { #define MAX_BPA_STACK ((int)max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1)) /* UAX#9 says to match opening brackets with the matching closing - brackets or their canonical equivalents. As of Unicode 7.0, there + brackets or their canonical equivalents. As of Unicode 8.0, there are only 2 bracket characters that have canonical equivalence decompositions: u+2329 and u+232A. So instead of accessing the table in uni-decomposition.el, we just handle these 2 characters @@ -2471,10 +2492,10 @@ typedef struct bpa_stack_entry { And finally, cross-reference these two: - fgrep -w -f brackets.txt decompositions.txt + grep -Fw -f brackets.txt decompositions.txt where "decompositions.txt" was produced by the 1st script, and - "brackets.txt" by the 2nd script. In the output of fgrep, look + "brackets.txt" by the 2nd script. In the output of grep, look only for decompositions that don't begin with some compatibility formatting tag, such as "". Only decompositions that consist solely of character codepoints are relevant to bidi @@ -2482,8 +2503,8 @@ typedef struct bpa_stack_entry { #define CANONICAL_EQU(c) \ ( ASCII_CHAR_P (c) ? c \ - : (c) == 0x2329 ? 0x3008 \ - : (c) == 0x232a ? 0x3009 \ + : (c) == LEFT_POINTING_ANGLE_BRACKET ? LEFT_ANGLE_BRACKET \ + : (c) == RIGHT_POINTING_ANGLE_BRACKET ? RIGHT_ANGLE_BRACKET \ : c ) #ifdef ENABLE_CHECKING @@ -2948,7 +2969,7 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) we are already at paragraph end. */ && (is_neutral || bidi_isolate_fmt_char (type))) /* N1-N2/Retaining */ - || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch))) + || type == WEAK_BN) { if (bidi_it->next_for_neutral.type != UNKNOWN_BT) { @@ -2978,8 +2999,10 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) entering the expensive loop in the "else" clause. */ else if (current_level == 0 && bidi_it->prev_for_neutral.type == STRONG_L - && !bidi_explicit_dir_char (bidi_it->ch) - && !bidi_isolate_fmt_char (type)) + && (ASCII_CHAR_P (bidi_it->ch) + || (type != WEAK_BN + && !bidi_explicit_dir_char (bidi_it->ch) + && !bidi_isolate_fmt_char (type)))) type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type, STRONG_L, current_level); else if (/* current level is 1 */ @@ -2991,6 +3014,7 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) && (bidi_it->prev_for_neutral.type == STRONG_R || bidi_it->prev_for_neutral.type == WEAK_EN || bidi_it->prev_for_neutral.type == WEAK_AN) + && type != WEAK_BN && !bidi_explicit_dir_char (bidi_it->ch) && !bidi_isolate_fmt_char (type)) type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type, @@ -2999,7 +3023,7 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) { /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in the assumption of batch-style processing; see clauses W4, - W5, and especially N1, which require to look far forward + W5, and especially N1, which require looking far forward (as well as back) in the buffer/string. May the fleas of a thousand camels infest the armpits of those who design supposedly general-purpose algorithms by looking at their @@ -3158,7 +3182,7 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) } } - /* Perhaps the character we want is already cached s fully resolved. + /* Perhaps the character we want is already cached as fully resolved. If it is, the call to bidi_cache_find below will return a type other than UNKNOWN_BT. */ if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt) @@ -3216,8 +3240,12 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) it belongs to a sequence of WS characters preceding a newline or a TAB or a paragraph separator. */ if ((bidi_it->orig_type == NEUTRAL_WS + || bidi_it->orig_type == WEAK_BN || bidi_isolate_fmt_char (bidi_it->orig_type)) - && bidi_it->next_for_ws.charpos < bidi_it->charpos) + && bidi_it->next_for_ws.charpos < bidi_it->charpos + /* If this character is already at base level, we don't need to + reset it, so avoid the potentially costly loop below. */ + && level != bidi_it->level_stack[0].level) { int ch; ptrdiff_t clen = bidi_it->ch_len; @@ -3249,11 +3277,14 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) /* Resolve implicit levels. */ if (bidi_it->orig_type == NEUTRAL_B /* L1 */ - || bidi_it->orig_type == NEUTRAL_S - || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB - || (bidi_it->orig_type == NEUTRAL_WS - && (bidi_it->next_for_ws.type == NEUTRAL_B - || bidi_it->next_for_ws.type == NEUTRAL_S))) + || bidi_it->orig_type == NEUTRAL_S + || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB + || ((bidi_it->orig_type == NEUTRAL_WS + || bidi_it->orig_type == WEAK_BN + || bidi_isolate_fmt_char (bidi_it->orig_type) + || bidi_explicit_dir_char (bidi_it->ch)) + && (bidi_it->next_for_ws.type == NEUTRAL_B + || bidi_it->next_for_ws.type == NEUTRAL_S))) level = bidi_it->level_stack[0].level; else if ((level & 1) == 0) /* I1 */ { @@ -3348,7 +3379,6 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) { int old_level, new_level, next_level; struct bidi_it sentinel; - struct gcpro gcpro1; if (bidi_it->charpos < 0 || bidi_it->bytepos < 0) emacs_abort (); @@ -3358,11 +3388,6 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) bidi_it->scan_dir = 1; /* default to logical order */ } - /* The code below can call eval, and thus cause GC. If we are - iterating a Lisp string, make sure it won't be GCed. */ - if (STRINGP (bidi_it->string.lstring)) - GCPRO1 (bidi_it->string.lstring); - /* If we just passed a newline, initialize for the next line. */ if (!bidi_it->first_elt && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB)) @@ -3508,9 +3533,6 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) eassert (bidi_it->resolved_level >= 0 && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2); - - if (STRINGP (bidi_it->string.lstring)) - UNGCPRO; } /* Utility function for looking for strong directional characters