X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/7e09ef09a479731d01b1ca46e94ddadd73ac98e3..7c1802f6ffc2704ba8042c7c1c6faa73dfa210d1:/src/bidi.c diff --git a/src/bidi.c b/src/bidi.c index ef0092f3d9..c23ff95435 100644 --- a/src/bidi.c +++ b/src/bidi.c @@ -1,13 +1,13 @@ /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs. - Copyright (C) 2000-2001, 2004-2005, 2009-2015 Free Software + Copyright (C) 2000-2001, 2004-2005, 2009-2016 Free Software Foundation, Inc. This file is part of GNU Emacs. GNU Emacs is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +the Free Software Foundation, either version 3 of the License, or (at +your option) any later version. GNU Emacs is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -262,7 +262,6 @@ typedef enum { } bidi_category_t; static Lisp_Object paragraph_start_re, paragraph_separate_re; -static Lisp_Object Qparagraph_start, Qparagraph_separate; /*********************************************************************** @@ -533,7 +532,7 @@ bidi_copy_it (struct bidi_it *to, struct bidi_it *from) /* Copy everything from the start through the active part of the level stack. */ memcpy (to, from, - (offsetof (struct bidi_it, level_stack[1]) + (offsetof (struct bidi_it, level_stack) + sizeof from->level_stack[0] + from->stack_idx * sizeof from->level_stack[0])); } @@ -1314,13 +1313,13 @@ bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos, /* `(space ...)' display specs are handled as paragraph separators for the purposes of the reordering; see UAX#9 section 3 and clause HL1 in section 4.3 there. */ - ch = 0x2029; + ch = PARAGRAPH_SEPARATOR; } else { /* All other display specs are handled as the Unicode Object Replacement Character. */ - ch = 0xFFFC; + ch = OBJECT_REPLACEMENT_CHARACTER; } disp_end_pos = compute_display_string_end (*disp_pos, string); if (disp_end_pos < 0) @@ -1800,6 +1799,11 @@ bidi_explicit_dir_char (int ch) if (!bidi_initialized) emacs_abort (); + if (ch < 0) + { + eassert (ch == BIDI_EOB); + return false; + } ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch)); return (ch_type == LRE || ch_type == LRO || ch_type == RLE || ch_type == RLO @@ -1915,8 +1919,6 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) { eassert (bidi_it->prev.charpos == bidi_it->charpos - 1); prev_type = bidi_it->prev.orig_type; - if (prev_type == FSI) - prev_type = bidi_it->type_after_wn; } } /* Don't move at end of buffer/string. */ @@ -1931,8 +1933,6 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) emacs_abort (); bidi_it->bytepos += bidi_it->ch_len; prev_type = bidi_it->orig_type; - if (prev_type == FSI) - prev_type = bidi_it->type_after_wn; } else /* EOB or end of string */ prev_type = NEUTRAL_B; @@ -2087,10 +2087,17 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) if (typ1 != STRONG_R && typ1 != STRONG_AL) { type = LRI; + /* Override orig_type, which will be needed when we come to + examine the next character, which is the first character + inside the isolate. */ + bidi_it->orig_type = type; goto fsi_as_lri; } else - type = RLI; + { + type = RLI; + bidi_it->orig_type = type; + } /* FALLTHROUGH */ case RLI: /* X5a */ if (override == NEUTRAL_DIR) @@ -2311,7 +2318,31 @@ bidi_resolve_weak (struct bidi_it *bidi_it) if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */ type = WEAK_EN; } - else if (bidi_it->next_en_pos >=0) + else if (type == WEAK_BN + /* This condition is for the following important case: + + . we are at level zero + . either previous strong character was L, + or we've seen no strong characters since sos + and the base paragraph direction is L2R + . this BN is NOT a bidi directional control + + For such a situation, either this BN will be + converted to EN per W5, and then to L by virtue + of W7; or it will become ON per W6, and then L + because of N1/N2. So we take a shortcut here + and make it L right away, to avoid the + potentially costly loop below. This is + important when the buffer has a long series of + control characters, like binary nulls, and no + R2L characters at all. */ + && new_level == 0 + && !bidi_explicit_dir_char (bidi_it->ch) + && ((bidi_it->last_strong.type == STRONG_L) + || (bidi_it->last_strong.type == UNKNOWN_BT + && bidi_it->sos == L2R))) + type = STRONG_L; + else if (bidi_it->next_en_pos >= 0) { /* We overstepped the last known position for ET resolution but there could be other such characters @@ -2448,7 +2479,7 @@ typedef struct bpa_stack_entry { #define MAX_BPA_STACK ((int)max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1)) /* UAX#9 says to match opening brackets with the matching closing - brackets or their canonical equivalents. As of Unicode 7.0, there + brackets or their canonical equivalents. As of Unicode 8.0, there are only 2 bracket characters that have canonical equivalence decompositions: u+2329 and u+232A. So instead of accessing the table in uni-decomposition.el, we just handle these 2 characters @@ -2478,8 +2509,8 @@ typedef struct bpa_stack_entry { #define CANONICAL_EQU(c) \ ( ASCII_CHAR_P (c) ? c \ - : (c) == 0x2329 ? 0x3008 \ - : (c) == 0x232a ? 0x3009 \ + : (c) == LEFT_POINTING_ANGLE_BRACKET ? LEFT_ANGLE_BRACKET \ + : (c) == RIGHT_POINTING_ANGLE_BRACKET ? RIGHT_ANGLE_BRACKET \ : c ) #ifdef ENABLE_CHECKING @@ -2944,7 +2975,7 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) we are already at paragraph end. */ && (is_neutral || bidi_isolate_fmt_char (type))) /* N1-N2/Retaining */ - || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch))) + || type == WEAK_BN) { if (bidi_it->next_for_neutral.type != UNKNOWN_BT) { @@ -2974,8 +3005,10 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) entering the expensive loop in the "else" clause. */ else if (current_level == 0 && bidi_it->prev_for_neutral.type == STRONG_L - && !bidi_explicit_dir_char (bidi_it->ch) - && !bidi_isolate_fmt_char (type)) + && (ASCII_CHAR_P (bidi_it->ch) + || (type != WEAK_BN + && !bidi_explicit_dir_char (bidi_it->ch) + && !bidi_isolate_fmt_char (type)))) type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type, STRONG_L, current_level); else if (/* current level is 1 */ @@ -2987,6 +3020,7 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) && (bidi_it->prev_for_neutral.type == STRONG_R || bidi_it->prev_for_neutral.type == WEAK_EN || bidi_it->prev_for_neutral.type == WEAK_AN) + && type != WEAK_BN && !bidi_explicit_dir_char (bidi_it->ch) && !bidi_isolate_fmt_char (type)) type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type, @@ -2995,7 +3029,7 @@ bidi_resolve_neutral (struct bidi_it *bidi_it) { /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in the assumption of batch-style processing; see clauses W4, - W5, and especially N1, which require to look far forward + W5, and especially N1, which require looking far forward (as well as back) in the buffer/string. May the fleas of a thousand camels infest the armpits of those who design supposedly general-purpose algorithms by looking at their @@ -3154,7 +3188,7 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) } } - /* Perhaps the character we want is already cached s fully resolved. + /* Perhaps the character we want is already cached as fully resolved. If it is, the call to bidi_cache_find below will return a type other than UNKNOWN_BT. */ if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt) @@ -3212,8 +3246,12 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) it belongs to a sequence of WS characters preceding a newline or a TAB or a paragraph separator. */ if ((bidi_it->orig_type == NEUTRAL_WS + || bidi_it->orig_type == WEAK_BN || bidi_isolate_fmt_char (bidi_it->orig_type)) - && bidi_it->next_for_ws.charpos < bidi_it->charpos) + && bidi_it->next_for_ws.charpos < bidi_it->charpos + /* If this character is already at base level, we don't need to + reset it, so avoid the potentially costly loop below. */ + && level != bidi_it->level_stack[0].level) { int ch; ptrdiff_t clen = bidi_it->ch_len; @@ -3245,11 +3283,14 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) /* Resolve implicit levels. */ if (bidi_it->orig_type == NEUTRAL_B /* L1 */ - || bidi_it->orig_type == NEUTRAL_S - || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB - || (bidi_it->orig_type == NEUTRAL_WS - && (bidi_it->next_for_ws.type == NEUTRAL_B - || bidi_it->next_for_ws.type == NEUTRAL_S))) + || bidi_it->orig_type == NEUTRAL_S + || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB + || ((bidi_it->orig_type == NEUTRAL_WS + || bidi_it->orig_type == WEAK_BN + || bidi_isolate_fmt_char (bidi_it->orig_type) + || bidi_explicit_dir_char (bidi_it->ch)) + && (bidi_it->next_for_ws.type == NEUTRAL_B + || bidi_it->next_for_ws.type == NEUTRAL_S))) level = bidi_it->level_stack[0].level; else if ((level & 1) == 0) /* I1 */ { @@ -3344,7 +3385,6 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) { int old_level, new_level, next_level; struct bidi_it sentinel; - struct gcpro gcpro1; if (bidi_it->charpos < 0 || bidi_it->bytepos < 0) emacs_abort (); @@ -3354,11 +3394,6 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) bidi_it->scan_dir = 1; /* default to logical order */ } - /* The code below can call eval, and thus cause GC. If we are - iterating a Lisp string, make sure it won't be GCed. */ - if (STRINGP (bidi_it->string.lstring)) - GCPRO1 (bidi_it->string.lstring); - /* If we just passed a newline, initialize for the next line. */ if (!bidi_it->first_elt && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB)) @@ -3504,9 +3539,6 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) eassert (bidi_it->resolved_level >= 0 && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2); - - if (STRINGP (bidi_it->string.lstring)) - UNGCPRO; } /* Utility function for looking for strong directional characters