code.delx.au - gnu-emacs/blob - src/bidi.c

   1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
   2    Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
   3    Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    (UBA) as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the Reference Implementation and most other implementations,
  26    this one is designed to be called once for every character in the
  27    buffer or string.  That way, we can leave intact the design of the
  28    Emacs display engine, whereby an iterator object is used to
  29    traverse buffer or string text character by character, and generate
  30    the necessary data for displaying each character in 'struct glyph'
  31    objects.  (See xdisp.c for the details of that iteration.)  The
  32    functions on this file replace the original linear iteration in the
  33    logical order of the text with a non-linear iteration in the visual
  34    order, i.e. in the order characters should be shown on display.
  35
  36    The main entry point is bidi_move_to_visually_next.  Each time it
  37    is called, it finds the next character in the visual order, and
  38    returns its information in a special structure.  The caller is then
  39    expected to process this character for display or any other
  40    purposes, and call bidi_move_to_visually_next for the next
  41    character.  See the comments in bidi_move_to_visually_next for more
  42    details about its algorithm that finds the next visual-order
  43    character by resolving their levels on the fly.
  44
  45    Two other entry points are bidi_paragraph_init and
  46    bidi_mirror_char.  The first determines the base direction of a
  47    paragraph, while the second returns the mirrored version of its
  48    argument character.
  49
  50    A few auxiliary entry points are used to initialize the bidi
  51    iterator for iterating an object (buffer or string), push and pop
  52    the bidi iterator state, and save and restore the state of the bidi
  53    cache.
  54
  55    If you want to understand the code, you will have to read it
  56    together with the relevant portions of UAX#9.  The comments include
  57    references to UAX#9 rules, for that very reason.
  58
  59    A note about references to UAX#9 rules: if the reference says
  60    something like "X9/Retaining", it means that you need to refer to
  61    rule X9 and to its modifications described in the "Implementation
  62    Notes" section of UAX#9, under "Retaining Format Codes".
  63
  64    Here's the overview of the design of the reordering engine
  65    implemented by this file.
  66
  67    Basic implementation structure
  68    ------------------------------
  69
  70    The sequential processing steps described by UAX#9 are implemented
  71    as recursive levels of processing, all of which examine the next
  72    character in the logical order.  This hierarchy of processing looks
  73    as follows, from the innermost (deepest) to the outermost level,
  74    omitting some subroutines used by each level:
  75
  76      bidi_fetch_char         -- fetch next character
  77      bidi_resolve_explicit   -- resolve explicit levels and directions
  78      bidi_resolve_weak       -- resolve weak types
  79      bidi_resolve_brackets   -- resolve "paired brackets" neutral types
  80      bidi_resolve_neutral    -- resolve neutral types
  81      bidi_level_of_next_char -- resolve implicit levels
  82
  83    Each level calls the level below it, and works on the result
  84    returned by the lower level, including all of its sub-levels.
  85
  86    Unlike all the levels below it, bidi_level_of_next_char can return
  87    the information about either the next or previous character in the
  88    logical order, depending on the current direction of scanning the
  89    buffer or string.  For the next character, it calls all the levels
  90    below it; for the previous character, it uses the cache, described
  91    below.
  92
  93    Thus, the result of calling bidi_level_of_next_char is the resolved
  94    level of the next or the previous character in the logical order.
  95    Based on this information, the function bidi_move_to_visually_next
  96    finds the next character in the visual order and updates the
  97    direction in which the buffer is scanned, either forward or
  98    backward, to find the next character to be displayed.  (Text is
  99    scanned backwards when it needs to be reversed for display, i.e. if
 100    the visual order is the inverse of the logical order.)  This
 101    implements the last, reordering steps of the UBA, by successively
 102    calling bidi_level_of_next_char until the character of the required
 103    embedding level is found; the scan direction is dynamically updated
 104    as a side effect.  See the commentary before the 'while' loop in
 105    bidi_move_to_visually_next, for the details.
 106
 107    Fetching characters
 108    -------------------
 109
 110    In a nutshell, fetching the next character boils down to calling
 111    STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
 112    string position.  See bidi_fetch_char.  However, if the next
 113    character is "covered" by a display property of some kind,
 114    bidi_fetch_char returns the u+FFFC "object replacement character"
 115    that represents the entire run of text covered by the display
 116    property.  (The ch_len and nchars members of 'struct bidi_it'
 117    reflect the length in bytes and characters of that text.)  This is
 118    so we reorder text on both sides of the display property as
 119    appropriate for an image or embedded string.  Similarly, text
 120    covered by a display spec of the form '(space ...)', is replaced
 121    with the u+2029 paragraph separator character, so such display
 122    specs produce the same effect as a TAB under UBA.  Both these
 123    special characters are not actually displayed -- the display
 124    property is displayed instead -- but just used to compute the
 125    embedding level of the surrounding text so as to produce the
 126    required effect.
 127
 128    Bidi iterator states
 129    --------------------
 130
 131    The UBA is highly context dependent in some of its parts,
 132    i.e. results of processing a character can generally depend on
 133    characters very far away.  The UAX#9 description of the UBA
 134    prescribes a stateful processing of each character, whereby the
 135    results of this processing depend on various state variables, such
 136    as the current embedding level, level stack, and directional
 137    override status.  In addition, the UAX#9 description includes many
 138    passages like this (from rule W2 in this case):
 139
 140      Search backward from each instance of a European number until the
 141      first strong type (R, L, AL, or sos) is found. If an AL is found,
 142      change the type of the European number to Arabic number.
 143
 144    To support this, we use a bidi iterator object, 'struct bidi_it',
 145    which is a sub-structure of 'struct it' used by xdisp.c (see
 146    dispextern.h for the definition of both of these structures).  The
 147    bidi iterator holds the entire state of the iteration required by
 148    the UBA, and is updated as the text is traversed.  In particular,
 149    the embedding level of the current character being resolved is
 150    recorded in the iterator state.  To avoid costly searches backward
 151    in support of rules like W2 above, the necessary character types
 152    are also recorded in the iterator state as they are found during
 153    the forward scan, and then used when such rules need to be applied.
 154    (Forward scans cannot be avoided in this way; they need to be
 155    performed at least once, and the results recorded in the iterator
 156    state, to be reused until the forward scan oversteps the recorded
 157    position.)
 158
 159    In this manner, the iterator state acts as a mini-cache of
 160    contextual information required for resolving the level of the
 161    current character by various UBA rules.
 162
 163    Caching of bidi iterator states
 164    -------------------------------
 165
 166    As described above, the reordering engine uses the information
 167    recorded in the bidi iterator state in order to resolve the
 168    embedding level of the current character.  When the reordering
 169    engine needs to process the next character in the logical order, it
 170    fetches it and applies to it all the UBA levels, updating the
 171    iterator state as it goes.  But when the buffer or string is
 172    scanned backwards, i.e. in the reverse order of buffer/string
 173    positions, the scanned characters were already processed during the
 174    preceding forward scan (see bidi_find_other_level_edge).  To avoid
 175    costly re-processing of characters that were already processed
 176    during the forward scan, the iterator states computed while
 177    scanning forward are cached.
 178
 179    The cache is just a linear array of 'struct bidi_it' objects, which
 180    is dynamically allocated and reallocated as needed, since the size
 181    of the cache depends on the text being processed.  We only need the
 182    cache while processing embedded levels higher than the base
 183    paragraph embedding level, because these higher levels require
 184    changes in scan direction.  Therefore, as soon as we are back to
 185    the base embedding level, we can free the cache; see the calls to
 186    bidi_cache_reset and bidi_cache_shrink, for the conditions to do
 187    this.
 188
 189    The cache maintains the index of the next unused cache slot -- this
 190    is where the next iterator state will be cached.  The function
 191    bidi_cache_iterator_state saves an instance of the state in the
 192    cache and increments the unused slot index.  The companion function
 193    bidi_cache_find looks up a cached state that corresponds to a given
 194    buffer/string position.  All of the cached states must correspond
 195    1:1 to the buffer or string region whose processing they reflect;
 196    bidi.c will abort if it finds cache slots that violate this 1:1
 197    correspondence.
 198
 199    When the parent iterator 'struct it' is pushed (see push_it in
 200    xdisp.c) to pause the current iteration and start iterating over a
 201    different object (e.g., a 'display' string that covers some buffer
 202    text), the bidi iterator cache needs to be "pushed" as well, so
 203    that a new empty cache could be used while iterating over the new
 204    object.  Later, when the new object is exhausted, and xdisp.c calls
 205    pop_it, we need to "pop" the bidi cache as well and return to the
 206    original cache.  See bidi_push_it and bidi_pop_it for how this is
 207    done.
 208
 209    Some functions of the display engine save copies of 'struct it' in
 210    local variables, and restore them later.  For examples, see
 211    pos_visible_p and move_it_in_display_line_to in xdisp.c, and
 212    window_scroll_pixel_based in window.c.  When this happens, we need
 213    to save and restore the bidi cache as well, because conceptually
 214    the cache is part of the 'struct it' state, and needs to be in
 215    perfect sync with the portion of the buffer/string that is being
 216    processed.  This saving and restoring of the cache state is handled
 217    by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
 218    SAVE_IT and RESTORE_IT defined on xdisp.c.
 219
 220    Note that, because reordering is implemented below the level in
 221    xdisp.c that breaks glyphs into screen lines, we are violating
 222    paragraph 3.4 of UAX#9. which mandates that line breaking shall be
 223    done before reordering each screen line separately.  However,
 224    following UAX#9 to the letter in this matter goes against the basic
 225    design of the Emacs display engine, and so we choose here this
 226    minor deviation from the UBA letter in preference to redesign of
 227    the display engine.  The effect of this is only seen in continued
 228    lines that are broken into screen lines in the middle of a run
 229    whose direction is opposite to the paragraph's base direction.
 230
 231    Important design and implementation note: when the code needs to
 232    scan far ahead, be sure to avoid such scans as much as possible
 233    when the buffer/string doesn't contain any RTL characters.  Users
 234    of left-to-right scripts will never forgive you if you introduce
 235    some slow-down due to bidi in situations that don't involve any
 236    bidirectional text.  See the large comment near the beginning of
 237    bidi_resolve_neutral, for one situation where such shortcut was
 238    necessary.  */
 239
 240 #include <config.h>
 241 #include <stdio.h>
 242
 243 #include "lisp.h"
 244 #include "character.h"
 245 #include "buffer.h"
 246 #include "dispextern.h"
 247 #include "region-cache.h"
 248
 249 static bool bidi_initialized = 0;
 250
 251 static Lisp_Object bidi_type_table, bidi_mirror_table, bidi_brackets_table;
 252
 253 #define BIDI_EOB   (-1)
 254
 255 /* Data type for describing the bidirectional character categories.  */
 256 typedef enum {
 257   UNKNOWN_BC,
 258   NEUTRAL,
 259   WEAK,
 260   STRONG,
 261   EXPLICIT_FORMATTING
 262 } bidi_category_t;
 263
 264 static Lisp_Object paragraph_start_re, paragraph_separate_re;
 265 static Lisp_Object Qparagraph_start, Qparagraph_separate;
 266
 267 \f
 268 /***********************************************************************
 269                         Utilities
 270  ***********************************************************************/
 271
 272 /* Return the bidi type of a character CH, subject to the current
 273    directional OVERRIDE.  */
 274 static bidi_type_t
 275 bidi_get_type (int ch, bidi_dir_t override)
 276 {
 277   bidi_type_t default_type;
 278
 279   if (ch == BIDI_EOB)
 280     return NEUTRAL_B;
 281   if (ch < 0 || ch > MAX_CHAR)
 282     emacs_abort ();
 283
 284   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 285   /* Every valid character code, even those that are unassigned by the
 286      UCD, have some bidi-class property, according to
 287      DerivedBidiClass.txt file.  Therefore, if we ever get UNKNOWN_BT
 288      (= zero) code from CHAR_TABLE_REF, that's a bug.  */
 289   if (default_type == UNKNOWN_BT)
 290     emacs_abort ();
 291
 292   switch (default_type)
 293     {
 294       case WEAK_BN:
 295       case NEUTRAL_B:
 296       case LRE:
 297       case LRO:
 298       case RLE:
 299       case RLO:
 300       case PDF:
 301       case LRI:
 302       case RLI:
 303       case FSI:
 304       case PDI:
 305         return default_type;
 306       default:
 307         if (override == L2R)
 308           return STRONG_L;
 309         else if (override == R2L)
 310           return STRONG_R;
 311         else
 312           return default_type;
 313     }
 314 }
 315
 316 static void
 317 bidi_check_type (bidi_type_t type)
 318 {
 319   eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
 320 }
 321
 322 /* Given a bidi TYPE of a character, return its category.  */
 323 static bidi_category_t
 324 bidi_get_category (bidi_type_t type)
 325 {
 326   switch (type)
 327     {
 328       case UNKNOWN_BT:
 329         return UNKNOWN_BC;
 330       case STRONG_L:
 331       case STRONG_R:
 332       case STRONG_AL:
 333         return STRONG;
 334       case WEAK_EN:
 335       case WEAK_ES:
 336       case WEAK_ET:
 337       case WEAK_AN:
 338       case WEAK_CS:
 339       case WEAK_NSM:
 340       case WEAK_BN:
 341         return WEAK;
 342       case NEUTRAL_B:
 343       case NEUTRAL_S:
 344       case NEUTRAL_WS:
 345       case NEUTRAL_ON:
 346         return NEUTRAL;
 347       case LRE:
 348       case LRO:
 349       case RLE:
 350       case RLO:
 351       case PDF:
 352       case LRI:
 353       case RLI:
 354       case FSI:
 355       case PDI:
 356         return EXPLICIT_FORMATTING;
 357       default:
 358         emacs_abort ();
 359     }
 360 }
 361
 362 static bool
 363 bidi_isolate_fmt_char (bidi_type_t ch_type)
 364 {
 365   return (ch_type == LRI || ch_type == RLI || ch_type == PDI || ch_type == FSI);
 366 }
 367
 368 /* Return the mirrored character of C, if it has one.  If C has no
 369    mirrored counterpart, return C.
 370    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 371    context must be tested by the caller.  */
 372 int
 373 bidi_mirror_char (int c)
 374 {
 375   Lisp_Object val;
 376
 377   if (c == BIDI_EOB)
 378     return c;
 379   if (c < 0 || c > MAX_CHAR)
 380     emacs_abort ();
 381
 382   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 383   if (INTEGERP (val))
 384     {
 385       int v;
 386
 387       /* When debugging, check before assigning to V, so that the check
 388          isn't broken by undefined behavior due to int overflow.  */
 389       eassert (CHAR_VALID_P (XINT (val)));
 390
 391       v = XINT (val);
 392
 393       /* Minimal test we must do in optimized builds, to prevent weird
 394          crashes further down the road.  */
 395       if (v < 0 || v > MAX_CHAR)
 396         emacs_abort ();
 397
 398       return v;
 399     }
 400
 401   return c;
 402 }
 403
 404 /* Return the Bidi_Paired_Bracket_Type property of the character C.  */
 405 static bidi_bracket_type_t
 406 bidi_paired_bracket_type (int c)
 407 {
 408   if (c == BIDI_EOB)
 409     return BIDI_BRACKET_NONE;
 410   if (c < 0 || c > MAX_CHAR)
 411     emacs_abort ();
 412
 413   return (bidi_bracket_type_t) XINT (CHAR_TABLE_REF (bidi_brackets_table, c));
 414 }
 415
 416 /* Determine the start-of-sequence (sos) directional type given the two
 417    embedding levels on either side of the run boundary.  Also, update
 418    the saved info about previously seen characters, since that info is
 419    generally valid for a single level run.  */
 420 static void
 421 bidi_set_sos_type (struct bidi_it *bidi_it, int level_before, int level_after)
 422 {
 423   int higher_level = (level_before > level_after ? level_before : level_after);
 424
 425   /* FIXME: should the default sos direction be user selectable?  */
 426   bidi_it->sos = ((higher_level & 1) != 0 ? R2L : L2R); /* X10 */
 427
 428   bidi_it->prev.type = UNKNOWN_BT;
 429   bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
 430   bidi_it->prev_for_neutral.type = (bidi_it->sos == R2L ? STRONG_R : STRONG_L);
 431   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 432   bidi_it->next_for_neutral.type
 433     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 434 }
 435
 436 /* Push the current embedding level and override status; reset the
 437    current level to LEVEL and the current override status to OVERRIDE.  */
 438 static void
 439 bidi_push_embedding_level (struct bidi_it *bidi_it,
 440                            int level, bidi_dir_t override, bool isolate_status)
 441 {
 442   struct bidi_stack *st;
 443   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 444
 445   bidi_it->stack_idx++;
 446   eassert (bidi_it->stack_idx < BIDI_MAXDEPTH+2+1);
 447   st = &bidi_it->level_stack[bidi_it->stack_idx];
 448   eassert (level <= (1 << 7));
 449   st->level = level;
 450   st->override = override;
 451   st->isolate_status = isolate_status;
 452   if (isolate_status)
 453     {
 454       st->last_strong = bidi_it->last_strong;
 455       st->prev_for_neutral = bidi_it->prev_for_neutral;
 456       st->next_for_neutral = bidi_it->next_for_neutral;
 457       st->sos = bidi_it->sos;
 458     }
 459   /* We've got a new isolating sequence, compute the directional type
 460      of sos and initialize per-sequence variables (UAX#9, clause X10).  */
 461   bidi_set_sos_type (bidi_it, prev_level, level);
 462 }
 463
 464 /* Pop from the stack the embedding level, the directional override
 465    status, and optionally saved information for the isolating run
 466    sequence.  Return the new level.  */
 467 static int
 468 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 469 {
 470   int level;
 471
 472   /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
 473      and PDIs (X6a, 2nd bullet).  */
 474   if (bidi_it->stack_idx > 0)
 475     {
 476       bool isolate_status
 477         = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
 478       int old_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 479
 480       struct bidi_stack st;
 481
 482       st = bidi_it->level_stack[bidi_it->stack_idx];
 483       if (isolate_status)
 484         {
 485           /* PREV is used in W1 for resolving WEAK_NSM.  By the time
 486              we get to an NSM, we must have gotten past at least one
 487              character: the PDI that ends the isolate from which we
 488              are popping here.  So PREV will have been filled up by
 489              the time we first use it.  We initialize it here to
 490              UNKNOWN_BT to be able to catch any blunders in this
 491              logic.  */
 492           bidi_it->prev.orig_type = bidi_it->prev.type = UNKNOWN_BT;
 493           bidi_it->last_strong = st.last_strong;
 494           bidi_it->prev_for_neutral = st.prev_for_neutral;
 495           bidi_it->next_for_neutral = st.next_for_neutral;
 496           bidi_it->sos = st.sos;
 497         }
 498       else
 499         bidi_set_sos_type (bidi_it, old_level,
 500                            bidi_it->level_stack[bidi_it->stack_idx - 1].level);
 501
 502       bidi_it->stack_idx--;
 503     }
 504   level = bidi_it->level_stack[bidi_it->stack_idx].level;
 505   eassert (0 <= level && level <= BIDI_MAXDEPTH + 1);
 506   return level;
 507 }
 508
 509 /* Record in SAVED_INFO the information about the current character.  */
 510 static void
 511 bidi_remember_char (struct bidi_saved_info *saved_info,
 512                     struct bidi_it *bidi_it, bool from_type)
 513 {
 514   saved_info->charpos = bidi_it->charpos;
 515   if (from_type)
 516     saved_info->type = bidi_it->type;
 517   else
 518     saved_info->type = bidi_it->type_after_wn;
 519   bidi_check_type (saved_info->type);
 520   saved_info->orig_type = bidi_it->orig_type;
 521   bidi_check_type (saved_info->orig_type);
 522 }
 523
 524 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 525    copies the part of the level stack that is actually in use.  */
 526 static void
 527 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 528 {
 529   /* Copy everything from the start through the active part of
 530      the level stack.  */
 531   memcpy (to, from,
 532           (offsetof (struct bidi_it, level_stack[1])
 533            + from->stack_idx * sizeof from->level_stack[0]));
 534 }
 535
 536 \f
 537 /***********************************************************************
 538                         Caching the bidi iterator states
 539  ***********************************************************************/
 540
 541 /* We allocate and de-allocate the cache in chunks of this size (in
 542    characters).  200 was chosen as an upper limit for reasonably-long
 543    lines in a text file/buffer.  */
 544 #define BIDI_CACHE_CHUNK 200
 545 static struct bidi_it *bidi_cache;
 546 static ptrdiff_t bidi_cache_size = 0;
 547 enum { elsz = sizeof (struct bidi_it) };
 548 static ptrdiff_t bidi_cache_idx;        /* next unused cache slot */
 549 static ptrdiff_t bidi_cache_last_idx;   /* slot of last cache hit */
 550 static ptrdiff_t bidi_cache_start = 0;  /* start of cache for this
 551                                            "stack" level */
 552
 553 /* 5-slot stack for saving the start of the previous level of the
 554    cache.  xdisp.c maintains a 5-slot stack for its iterator state,
 555    and we need the same size of our stack.  */
 556 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
 557 static int bidi_cache_sp;
 558
 559 /* Size of header used by bidi_shelve_cache.  */
 560 enum
 561   {
 562     bidi_shelve_header_size
 563       = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
 564          + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
 565          + sizeof (bidi_cache_last_idx))
 566   };
 567
 568 /* Reset the cache state to the empty state.  We only reset the part
 569    of the cache relevant to iteration of the current object.  Previous
 570    objects, which are pushed on the display iterator's stack, are left
 571    intact.  This is called when the cached information is no more
 572    useful for the current iteration, e.g. when we were reseated to a
 573    new position on the same object.  */
 574 static void
 575 bidi_cache_reset (void)
 576 {
 577   bidi_cache_idx = bidi_cache_start;
 578   bidi_cache_last_idx = -1;
 579 }
 580
 581 /* Shrink the cache to its minimal size.  Called when we init the bidi
 582    iterator for reordering a buffer or a string that does not come
 583    from display properties, because that means all the previously
 584    cached info is of no further use.  */
 585 static void
 586 bidi_cache_shrink (void)
 587 {
 588   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 589     {
 590       bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
 591       bidi_cache_size = BIDI_CACHE_CHUNK;
 592     }
 593   bidi_cache_reset ();
 594 }
 595
 596 static void
 597 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
 598 {
 599   int current_scan_dir = bidi_it->scan_dir;
 600
 601   if (idx < bidi_cache_start || idx >= bidi_cache_idx)
 602     emacs_abort ();
 603
 604   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 605   bidi_it->scan_dir = current_scan_dir;
 606   bidi_cache_last_idx = idx;
 607 }
 608
 609 /* Find a cached state with a given CHARPOS and resolved embedding
 610    level less or equal to LEVEL.  If LEVEL is -1, disregard the
 611    resolved levels in cached states.  DIR, if non-zero, means search
 612    in that direction from the last cache hit.  */
 613 static ptrdiff_t
 614 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
 615 {
 616   ptrdiff_t i, i_start;
 617
 618   if (bidi_cache_idx > bidi_cache_start)
 619     {
 620       if (bidi_cache_last_idx == -1)
 621         bidi_cache_last_idx = bidi_cache_idx - 1;
 622       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 623         {
 624           dir = -1;
 625           i_start = bidi_cache_last_idx - 1;
 626         }
 627       else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
 628                           + bidi_cache[bidi_cache_last_idx].nchars - 1))
 629         {
 630           dir = 1;
 631           i_start = bidi_cache_last_idx + 1;
 632         }
 633       else if (dir)
 634         i_start = bidi_cache_last_idx;
 635       else
 636         {
 637           dir = -1;
 638           i_start = bidi_cache_idx - 1;
 639         }
 640
 641       if (dir < 0)
 642         {
 643           /* Linear search for now; FIXME!  */
 644           for (i = i_start; i >= bidi_cache_start; i--)
 645             if (bidi_cache[i].charpos <= charpos
 646                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 647                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 648               return i;
 649         }
 650       else
 651         {
 652           for (i = i_start; i < bidi_cache_idx; i++)
 653             if (bidi_cache[i].charpos <= charpos
 654                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 655                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 656               return i;
 657         }
 658     }
 659
 660   return -1;
 661 }
 662
 663 /* Find a cached state where the resolved level changes to a value
 664    that is lower than LEVEL, and return its cache slot index.  DIR is
 665    the direction to search, starting with the last used cache slot.
 666    If DIR is zero, we search backwards from the last occupied cache
 667    slot.  BEFORE means return the index of the slot that
 668    is ``before'' the level change in the search direction.  That is,
 669    given the cached levels like this:
 670
 671          1122333442211
 672           AB        C
 673
 674    and assuming we are at the position cached at the slot marked with
 675    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 676    index of slot B or A, depending whether BEFORE is, respectively,
 677    true or false.  */
 678 static ptrdiff_t
 679 bidi_cache_find_level_change (int level, int dir, bool before)
 680 {
 681   if (bidi_cache_idx)
 682     {
 683       ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 684       int incr = before ? 1 : 0;
 685
 686       eassert (!dir || bidi_cache_last_idx >= 0);
 687
 688       if (!dir)
 689         dir = -1;
 690       else if (!incr)
 691         i += dir;
 692
 693       if (dir < 0)
 694         {
 695           while (i >= bidi_cache_start + incr)
 696             {
 697               if (bidi_cache[i - incr].resolved_level >= 0
 698                   && bidi_cache[i - incr].resolved_level < level)
 699                 return i;
 700               i--;
 701             }
 702         }
 703       else
 704         {
 705           while (i < bidi_cache_idx - incr)
 706             {
 707               if (bidi_cache[i + incr].resolved_level >= 0
 708                   && bidi_cache[i + incr].resolved_level < level)
 709                 return i;
 710               i++;
 711             }
 712         }
 713     }
 714
 715   return -1;
 716 }
 717
 718 static void
 719 bidi_cache_ensure_space (ptrdiff_t idx)
 720 {
 721   /* Enlarge the cache as needed.  */
 722   if (idx >= bidi_cache_size)
 723     {
 724       /* The bidi cache cannot be larger than the largest Lisp string
 725          or buffer.  */
 726       ptrdiff_t string_or_buffer_bound
 727         = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
 728
 729       /* Also, it cannot be larger than what C can represent.  */
 730       ptrdiff_t c_bound
 731         = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
 732
 733       bidi_cache
 734         = xpalloc (bidi_cache, &bidi_cache_size,
 735                    max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
 736                    min (string_or_buffer_bound, c_bound), elsz);
 737     }
 738 }
 739
 740 static void
 741 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved,
 742                            bool update_only)
 743 {
 744   ptrdiff_t idx;
 745
 746   /* We should never cache on backward scans.  */
 747   if (bidi_it->scan_dir == -1)
 748     emacs_abort ();
 749   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 750
 751   if (idx < 0 && update_only)
 752     return;
 753
 754   if (idx < 0)
 755     {
 756       idx = bidi_cache_idx;
 757       bidi_cache_ensure_space (idx);
 758       /* Character positions should correspond to cache positions 1:1.
 759          If we are outside the range of cached positions, the cache is
 760          useless and must be reset.  */
 761       if (idx > bidi_cache_start &&
 762           (bidi_it->charpos > (bidi_cache[idx - 1].charpos
 763                                + bidi_cache[idx - 1].nchars)
 764            || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
 765         {
 766           bidi_cache_reset ();
 767           idx = bidi_cache_start;
 768         }
 769       if (bidi_it->nchars <= 0)
 770         emacs_abort ();
 771       bidi_copy_it (&bidi_cache[idx], bidi_it);
 772       if (!resolved)
 773         bidi_cache[idx].resolved_level = -1;
 774     }
 775   else
 776     {
 777       /* Copy only the members which could have changed, to avoid
 778          costly copying of the entire struct.  */
 779       bidi_cache[idx].type = bidi_it->type;
 780       bidi_check_type (bidi_it->type);
 781       bidi_cache[idx].type_after_wn = bidi_it->type_after_wn;
 782       bidi_check_type (bidi_it->type_after_wn);
 783       if (resolved)
 784         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 785       else
 786         bidi_cache[idx].resolved_level = -1;
 787       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 788       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 789       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 790       bidi_cache[idx].disp_pos = bidi_it->disp_pos;
 791       bidi_cache[idx].disp_prop = bidi_it->disp_prop;
 792       bidi_cache[idx].bracket_pairing_pos = bidi_it->bracket_pairing_pos;
 793       bidi_cache[idx].bracket_enclosed_type = bidi_it->bracket_enclosed_type;
 794     }
 795
 796   bidi_cache_last_idx = idx;
 797   if (idx >= bidi_cache_idx)
 798     bidi_cache_idx = idx + 1;
 799 }
 800
 801 /* Look for a cached iterator state that corresponds to CHARPOS.  If
 802    found, copy the cached state into BIDI_IT and return the type of
 803    the cached entry.  If not found, return UNKNOWN_BT.  RESOLVED_ONLY
 804    zero means it is OK to return cached states that were not fully
 805    resolved yet.  This can happen if the state was cached before it
 806    was resolved in bidi_resolve_neutral.  */
 807 static bidi_type_t
 808 bidi_cache_find (ptrdiff_t charpos, bool resolved_only, struct bidi_it *bidi_it)
 809 {
 810   ptrdiff_t i = bidi_cache_search (charpos, -1, bidi_it->scan_dir);
 811
 812   if (i >= bidi_cache_start
 813       && (!resolved_only
 814           /* Callers that want only fully resolved states (and set
 815              resolved_only = true) need to be sure that there's enough
 816              info in the cached state to return the state as final,
 817              and if not, they don't want the cached state.  */
 818           || bidi_cache[i].resolved_level >= 0))
 819     {
 820       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 821
 822       bidi_copy_it (bidi_it, &bidi_cache[i]);
 823       bidi_cache_last_idx = i;
 824       /* Don't let scan direction from the cached state override
 825          the current scan direction.  */
 826       bidi_it->scan_dir = current_scan_dir;
 827       return bidi_it->type;
 828     }
 829
 830   return UNKNOWN_BT;
 831 }
 832
 833 static int
 834 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 835 {
 836   if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
 837     emacs_abort ();
 838   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 839 }
 840
 841 \f
 842 /***********************************************************************
 843              Pushing and popping the bidi iterator state
 844  ***********************************************************************/
 845
 846 /* Push the bidi iterator state in preparation for reordering a
 847    different object, e.g. display string found at certain buffer
 848    position.  Pushing the bidi iterator boils down to saving its
 849    entire state on the cache and starting a new cache "stacked" on top
 850    of the current cache.  */
 851 void
 852 bidi_push_it (struct bidi_it *bidi_it)
 853 {
 854   /* Save the current iterator state in its entirety after the last
 855      used cache slot.  */
 856   bidi_cache_ensure_space (bidi_cache_idx);
 857   bidi_cache[bidi_cache_idx++] = *bidi_it;
 858
 859   /* Push the current cache start onto the stack.  */
 860   eassert (bidi_cache_sp < IT_STACK_SIZE);
 861   bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
 862
 863   /* Start a new level of cache, and make it empty.  */
 864   bidi_cache_start = bidi_cache_idx;
 865   bidi_cache_last_idx = -1;
 866 }
 867
 868 /* Restore the iterator state saved by bidi_push_it and return the
 869    cache to the corresponding state.  */
 870 void
 871 bidi_pop_it (struct bidi_it *bidi_it)
 872 {
 873   if (bidi_cache_start <= 0)
 874     emacs_abort ();
 875
 876   /* Reset the next free cache slot index to what it was before the
 877      call to bidi_push_it.  */
 878   bidi_cache_idx = bidi_cache_start - 1;
 879
 880   /* Restore the bidi iterator state saved in the cache.  */
 881   *bidi_it = bidi_cache[bidi_cache_idx];
 882
 883   /* Pop the previous cache start from the stack.  */
 884   if (bidi_cache_sp <= 0)
 885     emacs_abort ();
 886   bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
 887
 888   /* Invalidate the last-used cache slot data.  */
 889   bidi_cache_last_idx = -1;
 890 }
 891
 892 static ptrdiff_t bidi_cache_total_alloc;
 893
 894 /* Stash away a copy of the cache and its control variables.  */
 895 void *
 896 bidi_shelve_cache (void)
 897 {
 898   unsigned char *databuf;
 899   ptrdiff_t alloc;
 900
 901   /* Empty cache.  */
 902   if (bidi_cache_idx == 0)
 903     return NULL;
 904
 905   alloc = (bidi_shelve_header_size
 906            + bidi_cache_idx * sizeof (struct bidi_it));
 907   databuf = xmalloc (alloc);
 908   bidi_cache_total_alloc += alloc;
 909
 910   memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
 911   memcpy (databuf + sizeof (bidi_cache_idx),
 912           bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
 913   memcpy (databuf + sizeof (bidi_cache_idx)
 914           + bidi_cache_idx * sizeof (struct bidi_it),
 915           bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
 916   memcpy (databuf + sizeof (bidi_cache_idx)
 917           + bidi_cache_idx * sizeof (struct bidi_it)
 918           + sizeof (bidi_cache_start_stack),
 919           &bidi_cache_sp, sizeof (bidi_cache_sp));
 920   memcpy (databuf + sizeof (bidi_cache_idx)
 921           + bidi_cache_idx * sizeof (struct bidi_it)
 922           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 923           &bidi_cache_start, sizeof (bidi_cache_start));
 924   memcpy (databuf + sizeof (bidi_cache_idx)
 925           + bidi_cache_idx * sizeof (struct bidi_it)
 926           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 927           + sizeof (bidi_cache_start),
 928           &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
 929
 930   return databuf;
 931 }
 932
 933 /* Restore the cache state from a copy stashed away by
 934    bidi_shelve_cache, and free the buffer used to stash that copy.
 935    JUST_FREE means free the buffer, but don't restore the
 936    cache; used when the corresponding iterator is discarded instead of
 937    being restored.  */
 938 void
 939 bidi_unshelve_cache (void *databuf, bool just_free)
 940 {
 941   unsigned char *p = databuf;
 942
 943   if (!p)
 944     {
 945       if (!just_free)
 946         {
 947           /* A NULL pointer means an empty cache.  */
 948           bidi_cache_start = 0;
 949           bidi_cache_sp = 0;
 950           bidi_cache_reset ();
 951         }
 952     }
 953   else
 954     {
 955       if (just_free)
 956         {
 957           ptrdiff_t idx;
 958
 959           memcpy (&idx, p, sizeof (bidi_cache_idx));
 960           bidi_cache_total_alloc
 961             -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
 962         }
 963       else
 964         {
 965           memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
 966           bidi_cache_ensure_space (bidi_cache_idx);
 967           memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
 968                   bidi_cache_idx * sizeof (struct bidi_it));
 969           memcpy (bidi_cache_start_stack,
 970                   p + sizeof (bidi_cache_idx)
 971                   + bidi_cache_idx * sizeof (struct bidi_it),
 972                   sizeof (bidi_cache_start_stack));
 973           memcpy (&bidi_cache_sp,
 974                   p + sizeof (bidi_cache_idx)
 975                   + bidi_cache_idx * sizeof (struct bidi_it)
 976                   + sizeof (bidi_cache_start_stack),
 977                   sizeof (bidi_cache_sp));
 978           memcpy (&bidi_cache_start,
 979                   p + sizeof (bidi_cache_idx)
 980                   + bidi_cache_idx * sizeof (struct bidi_it)
 981                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 982                   sizeof (bidi_cache_start));
 983           memcpy (&bidi_cache_last_idx,
 984                   p + sizeof (bidi_cache_idx)
 985                   + bidi_cache_idx * sizeof (struct bidi_it)
 986                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 987                   + sizeof (bidi_cache_start),
 988                   sizeof (bidi_cache_last_idx));
 989           bidi_cache_total_alloc
 990             -= (bidi_shelve_header_size
 991                 + bidi_cache_idx * sizeof (struct bidi_it));
 992         }
 993
 994       xfree (p);
 995     }
 996 }
 997
 998 \f
 999 /***********************************************************************
1000                         Initialization
1001  ***********************************************************************/
1002 static void
1003 bidi_initialize (void)
1004 {
1005   bidi_type_table = uniprop_table (intern ("bidi-class"));
1006   if (NILP (bidi_type_table))
1007     emacs_abort ();
1008   staticpro (&bidi_type_table);
1009
1010   bidi_mirror_table = uniprop_table (intern ("mirroring"));
1011   if (NILP (bidi_mirror_table))
1012     emacs_abort ();
1013   staticpro (&bidi_mirror_table);
1014
1015   bidi_brackets_table = uniprop_table (intern ("bracket-type"));
1016   if (NILP (bidi_brackets_table))
1017     emacs_abort ();
1018   staticpro (&bidi_brackets_table);
1019
1020   Qparagraph_start = intern ("paragraph-start");
1021   staticpro (&Qparagraph_start);
1022   paragraph_start_re = Fsymbol_value (Qparagraph_start);
1023   if (!STRINGP (paragraph_start_re))
1024     paragraph_start_re = build_string ("\f\\|[ \t]*$");
1025   staticpro (&paragraph_start_re);
1026   Qparagraph_separate = intern ("paragraph-separate");
1027   staticpro (&Qparagraph_separate);
1028   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
1029   if (!STRINGP (paragraph_separate_re))
1030     paragraph_separate_re = build_string ("[ \t\f]*$");
1031   staticpro (&paragraph_separate_re);
1032
1033   bidi_cache_sp = 0;
1034   bidi_cache_total_alloc = 0;
1035
1036   bidi_initialized = 1;
1037 }
1038
1039 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1040    end.  */
1041 static void
1042 bidi_set_paragraph_end (struct bidi_it *bidi_it)
1043 {
1044   bidi_it->invalid_levels = 0;
1045   bidi_it->invalid_isolates = 0;
1046   bidi_it->stack_idx = 0;
1047   bidi_it->resolved_level = bidi_it->level_stack[0].level;
1048 }
1049
1050 /* Initialize the bidi iterator from buffer/string position CHARPOS.  */
1051 void
1052 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
1053               struct bidi_it *bidi_it)
1054 {
1055   if (! bidi_initialized)
1056     bidi_initialize ();
1057   if (charpos >= 0)
1058     bidi_it->charpos = charpos;
1059   if (bytepos >= 0)
1060     bidi_it->bytepos = bytepos;
1061   bidi_it->frame_window_p = frame_window_p;
1062   bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit */
1063   bidi_it->first_elt = 1;
1064   bidi_set_paragraph_end (bidi_it);
1065   bidi_it->new_paragraph = 1;
1066   bidi_it->separator_limit = -1;
1067   bidi_it->type = NEUTRAL_B;
1068   bidi_it->type_after_wn = NEUTRAL_B;
1069   bidi_it->orig_type = NEUTRAL_B;
1070   /* FIXME: Review this!!! */
1071   bidi_it->prev.type = bidi_it->prev.orig_type = UNKNOWN_BT;
1072   bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1073   bidi_it->next_for_neutral.charpos = -1;
1074   bidi_it->next_for_neutral.type
1075     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1076   bidi_it->prev_for_neutral.charpos = -1;
1077   bidi_it->prev_for_neutral.type
1078     = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1079   bidi_it->sos = L2R;    /* FIXME: should it be user-selectable? */
1080   bidi_it->disp_pos = -1;       /* invalid/unknown */
1081   bidi_it->disp_prop = 0;
1082   /* We can only shrink the cache if we are at the bottom level of its
1083      "stack".  */
1084   if (bidi_cache_start == 0)
1085     bidi_cache_shrink ();
1086   else
1087     bidi_cache_reset ();
1088 }
1089
1090 /* Perform initializations for reordering a new line of bidi text.  */
1091 static void
1092 bidi_line_init (struct bidi_it *bidi_it)
1093 {
1094   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1095   bidi_it->stack_idx = 0;
1096   bidi_it->resolved_level = bidi_it->level_stack[0].level;
1097   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
1098   bidi_it->level_stack[0].isolate_status = false; /* X1 */
1099   bidi_it->invalid_levels = 0;
1100   bidi_it->isolate_level = 0;    /* X1 */
1101   bidi_it->invalid_isolates = 0; /* X1 */
1102   /* Setting this to zero will force its recomputation the first time
1103      we need it for W5.  */
1104   bidi_it->next_en_pos = 0;
1105   bidi_it->next_en_type = UNKNOWN_BT;
1106   bidi_it->next_for_ws.charpos = -1;
1107   bidi_it->next_for_ws.type = UNKNOWN_BT;
1108   bidi_set_sos_type (bidi_it,
1109                      (bidi_it->paragraph_dir == R2L ? 1 : 0),
1110                      bidi_it->level_stack[0].level); /* X10 */
1111
1112   bidi_cache_reset ();
1113 }
1114
1115 \f
1116 /***********************************************************************
1117                         Fetching characters
1118  ***********************************************************************/
1119
1120 /* Count bytes in string S between BEG/BEGBYTE and END.  BEG and END
1121    are zero-based character positions in S, BEGBYTE is byte position
1122    corresponding to BEG.  UNIBYTE means S is a unibyte string.  */
1123 static ptrdiff_t
1124 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1125                   ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1126 {
1127   ptrdiff_t pos = beg;
1128   const unsigned char *p = s + begbyte, *start = p;
1129
1130   if (unibyte)
1131     p = s + end;
1132   else
1133     {
1134       if (!CHAR_HEAD_P (*p))
1135         emacs_abort ();
1136
1137       while (pos < end)
1138         {
1139           p += BYTES_BY_CHAR_HEAD (*p);
1140           pos++;
1141         }
1142     }
1143
1144   return p - start;
1145 }
1146
1147 /* Fetch and return the character at byte position BYTEPOS.  If S is
1148    non-NULL, fetch the character from string S; otherwise fetch the
1149    character from the current buffer.  UNIBYTE means S is a
1150    unibyte string.  */
1151 static int
1152 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1153 {
1154   if (s)
1155     {
1156       s += bytepos;
1157       if (unibyte)
1158         return *s;
1159     }
1160   else
1161     s = BYTE_POS_ADDR (bytepos);
1162   return STRING_CHAR (s);
1163 }
1164
1165 /* Fetch and return the character at CHARPOS/BYTEPOS.  If that
1166    character is covered by a display string, treat the entire run of
1167    covered characters as a single character, either u+2029 or u+FFFC,
1168    and return their combined length in CH_LEN and NCHARS.  DISP_POS
1169    specifies the character position of the next display string, or -1
1170    if not yet computed.  When the next character is at or beyond that
1171    position, the function updates DISP_POS with the position of the
1172    next display string.  *DISP_PROP non-zero means that there's really
1173    a display string at DISP_POS, as opposed to when we searched till
1174    DISP_POS without finding one.  If *DISP_PROP is 2, it means the
1175    display spec is of the form `(space ...)', which is replaced with
1176    u+2029 to handle it as a paragraph separator.  STRING->s is the C
1177    string to iterate, or NULL if iterating over a buffer or a Lisp
1178    string; in the latter case, STRING->lstring is the Lisp string.  */
1179 static int
1180 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1181                  int *disp_prop, struct bidi_string_data *string,
1182                  struct window *w,
1183                  bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1184 {
1185   int ch;
1186   ptrdiff_t endpos
1187     = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1188   struct text_pos pos;
1189   int len;
1190
1191   /* If we got past the last known position of display string, compute
1192      the position of the next one.  That position could be at CHARPOS.  */
1193   if (charpos < endpos && charpos > *disp_pos)
1194     {
1195       SET_TEXT_POS (pos, charpos, bytepos);
1196       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1197                                               disp_prop);
1198     }
1199
1200   /* Fetch the character at BYTEPOS.  */
1201   if (charpos >= endpos)
1202     {
1203       ch = BIDI_EOB;
1204       *ch_len = 1;
1205       *nchars = 1;
1206       *disp_pos = endpos;
1207       *disp_prop = 0;
1208     }
1209   else if (charpos >= *disp_pos && *disp_prop)
1210     {
1211       ptrdiff_t disp_end_pos;
1212
1213       /* We don't expect to find ourselves in the middle of a display
1214          property.  Hopefully, it will never be needed.  */
1215       if (charpos > *disp_pos)
1216         emacs_abort ();
1217       /* Text covered by `display' properties and overlays with
1218          display properties or display strings is handled as a single
1219          character that represents the entire run of characters
1220          covered by the display property.  */
1221       if (*disp_prop == 2)
1222         {
1223           /* `(space ...)' display specs are handled as paragraph
1224              separators for the purposes of the reordering; see UAX#9
1225              section 3 and clause HL1 in section 4.3 there.  */
1226           ch = 0x2029;
1227         }
1228       else
1229         {
1230           /* All other display specs are handled as the Unicode Object
1231              Replacement Character.  */
1232           ch = 0xFFFC;
1233         }
1234       disp_end_pos = compute_display_string_end (*disp_pos, string);
1235       if (disp_end_pos < 0)
1236         {
1237           /* Somebody removed the display string from the buffer
1238              behind our back.  Recover by processing this buffer
1239              position as if no display property were present there to
1240              begin with.  */
1241           *disp_prop = 0;
1242           goto normal_char;
1243         }
1244       *nchars = disp_end_pos - *disp_pos;
1245       if (*nchars <= 0)
1246         emacs_abort ();
1247       if (string->s)
1248         *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1249                                     disp_end_pos, string->unibyte);
1250       else if (STRINGP (string->lstring))
1251         *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1252                                     bytepos, disp_end_pos, string->unibyte);
1253       else
1254         *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1255     }
1256   else
1257     {
1258     normal_char:
1259       if (string->s)
1260         {
1261
1262           if (!string->unibyte)
1263             {
1264               ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1265               *ch_len = len;
1266             }
1267           else
1268             {
1269               ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1270               *ch_len = 1;
1271             }
1272         }
1273       else if (STRINGP (string->lstring))
1274         {
1275           if (!string->unibyte)
1276             {
1277               ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1278                                            len);
1279               *ch_len = len;
1280             }
1281           else
1282             {
1283               ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1284               *ch_len = 1;
1285             }
1286         }
1287       else
1288         {
1289           ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1290           *ch_len = len;
1291         }
1292       *nchars = 1;
1293     }
1294
1295   /* If we just entered a run of characters covered by a display
1296      string, compute the position of the next display string.  */
1297   if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1298       && *disp_prop)
1299     {
1300       SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1301       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1302                                               disp_prop);
1303     }
1304
1305   return ch;
1306 }
1307
1308 /* Like bidi_fetch_char, but ignore any text between an isolate
1309    initiator and its matching PDI or, if it has no matching PDI, the
1310    end of the paragraph.  If isolates were skipped, CH_LEN and NCHARS
1311    are set to the number of bytes and characters between BYTEPOS/CHARPOS
1312    and the character that was fetched after skipping the isolates.  */
1313 static int
1314 bidi_fetch_char_skip_isolates (ptrdiff_t charpos, ptrdiff_t bytepos,
1315                                ptrdiff_t *disp_pos, int *disp_prop,
1316                                struct bidi_string_data *string,
1317                                struct window *w, bool frame_window_p,
1318                                ptrdiff_t *ch_len, ptrdiff_t *nchars)
1319 {
1320   ptrdiff_t orig_charpos = charpos, orig_bytepos = bytepos;
1321   int ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string, w,
1322                             frame_window_p, ch_len, nchars);
1323   bidi_type_t ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1324   ptrdiff_t level = 0;
1325
1326   if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1327     {
1328       level++;
1329       while (level > 0 && ch_type != NEUTRAL_B)
1330         {
1331           charpos += *nchars;
1332           bytepos += *ch_len;
1333           ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string,
1334                                 w, frame_window_p, ch_len, nchars);
1335           ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1336           /* A Note to P2 says to ignore max_depth limit.  */
1337           if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1338             level++;
1339           else if (ch_type == PDI)
1340             level--;
1341         }
1342     }
1343
1344   /* Communicate to the caller how much did we skip, so it could get
1345      past the last character position we examined.  */
1346   *nchars += charpos - orig_charpos;
1347   *ch_len += bytepos - orig_bytepos;
1348   return ch;
1349 }
1350
1351
1352 \f
1353 /***********************************************************************
1354                         Determining paragraph direction
1355  ***********************************************************************/
1356
1357 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1358    Value is the non-negative length of the paragraph separator
1359    following the buffer position, -1 if position is at the beginning
1360    of a new paragraph, or -2 if position is neither at beginning nor
1361    at end of a paragraph.  */
1362 static ptrdiff_t
1363 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1364 {
1365   Lisp_Object sep_re;
1366   Lisp_Object start_re;
1367   ptrdiff_t val;
1368
1369   sep_re = paragraph_separate_re;
1370   start_re = paragraph_start_re;
1371
1372   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1373   if (val < 0)
1374     {
1375       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1376         val = -1;
1377       else
1378         val = -2;
1379     }
1380
1381   return val;
1382 }
1383
1384 /* If the user has requested the long scans caching, make sure that
1385    BIDI cache is enabled.  Otherwise, make sure it's disabled.  */
1386
1387 static struct region_cache *
1388 bidi_paragraph_cache_on_off (void)
1389 {
1390   struct buffer *cache_buffer = current_buffer;
1391   bool indirect_p = false;
1392
1393   /* For indirect buffers, make sure to use the cache of their base
1394      buffer.  */
1395   if (cache_buffer->base_buffer)
1396     {
1397       cache_buffer = cache_buffer->base_buffer;
1398       indirect_p = true;
1399     }
1400
1401   /* Don't turn on or off the cache in the base buffer, if the value
1402      of cache-long-scans of the base buffer is inconsistent with that.
1403      This is because doing so will just make the cache pure overhead,
1404      since if we turn it on via indirect buffer, it will be
1405      immediately turned off by its base buffer.  */
1406   if (NILP (BVAR (current_buffer, cache_long_scans)))
1407     {
1408       if (!indirect_p
1409           || NILP (BVAR (cache_buffer, cache_long_scans)))
1410         {
1411           if (cache_buffer->bidi_paragraph_cache)
1412             {
1413               free_region_cache (cache_buffer->bidi_paragraph_cache);
1414               cache_buffer->bidi_paragraph_cache = 0;
1415             }
1416         }
1417       return NULL;
1418     }
1419   else
1420     {
1421       if (!indirect_p
1422           || !NILP (BVAR (cache_buffer, cache_long_scans)))
1423         {
1424           if (!cache_buffer->bidi_paragraph_cache)
1425             cache_buffer->bidi_paragraph_cache = new_region_cache ();
1426         }
1427       return cache_buffer->bidi_paragraph_cache;
1428     }
1429 }
1430
1431 /* On my 2005-vintage machine, searching back for paragraph start
1432    takes ~1 ms per line.  And bidi_paragraph_init is called 4 times
1433    when user types C-p.  The number below limits each call to
1434    bidi_paragraph_init to about 10 ms.  */
1435 #define MAX_PARAGRAPH_SEARCH 7500
1436
1437 /* Find the beginning of this paragraph by looking back in the buffer.
1438    Value is the byte position of the paragraph's beginning, or
1439    BEGV_BYTE if paragraph_start_re is still not found after looking
1440    back MAX_PARAGRAPH_SEARCH lines in the buffer.  */
1441 static ptrdiff_t
1442 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1443 {
1444   Lisp_Object re = paragraph_start_re;
1445   ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1446   struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1447   ptrdiff_t n = 0, oldpos = pos, next;
1448   struct buffer *cache_buffer = current_buffer;
1449
1450   if (cache_buffer->base_buffer)
1451     cache_buffer = cache_buffer->base_buffer;
1452
1453   while (pos_byte > BEGV_BYTE
1454          && n++ < MAX_PARAGRAPH_SEARCH
1455          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1456     {
1457       /* FIXME: What if the paragraph beginning is covered by a
1458          display string?  And what if a display string covering some
1459          of the text over which we scan back includes
1460          paragraph_start_re?  */
1461       DEC_BOTH (pos, pos_byte);
1462       if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1463         {
1464           pos = next, pos_byte = CHAR_TO_BYTE (pos);
1465           break;
1466         }
1467       else
1468         pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1469     }
1470   if (n >= MAX_PARAGRAPH_SEARCH)
1471     pos = BEGV, pos_byte = BEGV_BYTE;
1472   if (bpc)
1473     know_region_cache (cache_buffer, bpc, pos, oldpos);
1474   /* Positions returned by the region cache are not limited to
1475      BEGV..ZV range, so we limit them here.  */
1476   pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1477   return pos_byte;
1478 }
1479
1480 /* On a 3.4 GHz machine, searching forward for a strong directional
1481    character in a long paragraph full of weaks or neutrals takes about
1482    1 ms for each 20K characters.  The number below limits each call to
1483    bidi_paragraph_init to less than 10 ms even on slow machines.  */
1484 #define MAX_STRONG_CHAR_SEARCH 100000
1485
1486 /* Starting from POS, find the first strong (L, R, or AL) character,
1487    while skipping over any characters between an isolate initiator and
1488    its matching PDI.  STOP_AT_PDI non-zero means stop at the PDI that
1489    matches the isolate initiator at POS.  Return the bidi type of the
1490    character where the search stopped.  Give up if after examining
1491    MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1492    character was found.  */
1493 static bidi_type_t
1494 find_first_strong_char (ptrdiff_t pos, ptrdiff_t bytepos, ptrdiff_t end,
1495                         ptrdiff_t *disp_pos, int *disp_prop,
1496                         struct bidi_string_data *string, struct window *w,
1497                         bool string_p, bool frame_window_p,
1498                         ptrdiff_t *ch_len, ptrdiff_t *nchars, bool stop_at_pdi)
1499 {
1500   ptrdiff_t pos1;
1501   bidi_type_t type;
1502   int ch;
1503
1504   if (stop_at_pdi)
1505     {
1506       /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1507          at POS.  Get past it.  */
1508 #ifdef ENABLE_CHECKING
1509       ch = bidi_fetch_char (pos, bytepos, disp_pos, disp_prop, string, w,
1510                             frame_window_p, ch_len, nchars);
1511       type = bidi_get_type (ch, NEUTRAL_DIR);
1512       eassert (type == FSI /* || type == LRI || type == RLI */);
1513 #endif
1514       pos += *nchars;
1515       bytepos += *ch_len;
1516     }
1517   ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop, string,
1518                                       w, frame_window_p, ch_len, nchars);
1519   type = bidi_get_type (ch, NEUTRAL_DIR);
1520
1521   pos1 = pos;
1522   for (pos += *nchars, bytepos += *ch_len;
1523        bidi_get_category (type) != STRONG
1524          /* If requested to stop at first PDI, stop there.  */
1525          && !(stop_at_pdi && type == PDI)
1526          /* Stop when searched too far into an abnormally large
1527             paragraph full of weak or neutral characters.  */
1528          && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1529        type = bidi_get_type (ch, NEUTRAL_DIR))
1530     {
1531       if (pos >= end)
1532         {
1533           /* Pretend there's a paragraph separator at end of
1534              buffer/string.  */
1535           type = NEUTRAL_B;
1536           break;
1537         }
1538       if (!string_p
1539           && type == NEUTRAL_B
1540           && bidi_at_paragraph_end (pos, bytepos) >= -1)
1541         break;
1542       /* Fetch next character and advance to get past it.  */
1543       ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop,
1544                                           string, w, frame_window_p,
1545                                           ch_len, nchars);
1546       pos += *nchars;
1547       bytepos += *ch_len;
1548     }
1549   return type;
1550 }
1551
1552 /* Determine the base direction, a.k.a. base embedding level, of the
1553    paragraph we are about to iterate through.  If DIR is either L2R or
1554    R2L, just use that.  Otherwise, determine the paragraph direction
1555    from the first strong directional character of the paragraph.
1556
1557    NO_DEFAULT_P means don't default to L2R if the paragraph
1558    has no strong directional characters and both DIR and
1559    bidi_it->paragraph_dir are NEUTRAL_DIR.  In that case, search back
1560    in the buffer until a paragraph is found with a strong character,
1561    or until hitting BEGV.  In the latter case, fall back to L2R.  This
1562    flag is used in current-bidi-paragraph-direction.
1563
1564    Note that this function gives the paragraph separator the same
1565    direction as the preceding paragraph, even though Emacs generally
1566    views the separator as not belonging to any paragraph.  */
1567 void
1568 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1569 {
1570   ptrdiff_t bytepos = bidi_it->bytepos;
1571   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1572   ptrdiff_t pstartbyte;
1573   /* Note that begbyte is a byte position, while end is a character
1574      position.  Yes, this is ugly, but we are trying to avoid costly
1575      calls to BYTE_TO_CHAR and its ilk.  */
1576   ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1577   ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1578
1579   /* Special case for an empty buffer. */
1580   if (bytepos == begbyte && bidi_it->charpos == end)
1581     dir = L2R;
1582   /* We should never be called at EOB or before BEGV.  */
1583   else if (bidi_it->charpos >= end || bytepos < begbyte)
1584     emacs_abort ();
1585
1586   if (dir == L2R)
1587     {
1588       bidi_it->paragraph_dir = L2R;
1589       bidi_it->new_paragraph = 0;
1590     }
1591   else if (dir == R2L)
1592     {
1593       bidi_it->paragraph_dir = R2L;
1594       bidi_it->new_paragraph = 0;
1595     }
1596   else if (dir == NEUTRAL_DIR)  /* P2 */
1597     {
1598       ptrdiff_t ch_len, nchars;
1599       ptrdiff_t pos, disp_pos = -1;
1600       int disp_prop = 0;
1601       bidi_type_t type;
1602       const unsigned char *s;
1603
1604       if (!bidi_initialized)
1605         bidi_initialize ();
1606
1607       /* If we are inside a paragraph separator, we are just waiting
1608          for the separator to be exhausted; use the previous paragraph
1609          direction.  But don't do that if we have been just reseated,
1610          because we need to reinitialize below in that case.  */
1611       if (!bidi_it->first_elt
1612           && bidi_it->charpos < bidi_it->separator_limit)
1613         return;
1614
1615       /* If we are on a newline, get past it to where the next
1616          paragraph might start.  But don't do that at BEGV since then
1617          we are potentially in a new paragraph that doesn't yet
1618          exist.  */
1619       pos = bidi_it->charpos;
1620       s = (STRINGP (bidi_it->string.lstring)
1621            ? SDATA (bidi_it->string.lstring)
1622            : bidi_it->string.s);
1623       if (bytepos > begbyte
1624           && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1625         {
1626           bytepos++;
1627           pos++;
1628         }
1629
1630       /* We are either at the beginning of a paragraph or in the
1631          middle of it.  Find where this paragraph starts.  */
1632       if (string_p)
1633         {
1634           /* We don't support changes of paragraph direction inside a
1635              string.  It is treated as a single paragraph.  */
1636           pstartbyte = 0;
1637         }
1638       else
1639         pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1640       bidi_it->separator_limit = -1;
1641       bidi_it->new_paragraph = 0;
1642
1643       /* The following loop is run more than once only if NO_DEFAULT_P,
1644          and only if we are iterating on a buffer.  */
1645       do {
1646         bytepos = pstartbyte;
1647         if (!string_p)
1648           pos = BYTE_TO_CHAR (bytepos);
1649         type = find_first_strong_char (pos, bytepos, end, &disp_pos, &disp_prop,
1650                                        &bidi_it->string, bidi_it->w,
1651                                        string_p, bidi_it->frame_window_p,
1652                                        &ch_len, &nchars, false);
1653         if (type == STRONG_R || type == STRONG_AL) /* P3 */
1654           bidi_it->paragraph_dir = R2L;
1655         else if (type == STRONG_L)
1656           bidi_it->paragraph_dir = L2R;
1657         if (!string_p
1658             && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1659           {
1660             /* If this paragraph is at BEGV, default to L2R.  */
1661             if (pstartbyte == BEGV_BYTE)
1662               bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1663             else
1664               {
1665                 ptrdiff_t prevpbyte = pstartbyte;
1666                 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1667
1668                 /* Find the beginning of the previous paragraph, if any.  */
1669                 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1670                   {
1671                     /* FXIME: What if p is covered by a display
1672                        string?  See also a FIXME inside
1673                        bidi_find_paragraph_start.  */
1674                     DEC_BOTH (p, pbyte);
1675                     prevpbyte = bidi_find_paragraph_start (p, pbyte);
1676                   }
1677                 pstartbyte = prevpbyte;
1678               }
1679           }
1680       } while (!string_p
1681                && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1682     }
1683   else
1684     emacs_abort ();
1685
1686   /* Contrary to UAX#9 clause P3, we only default the paragraph
1687      direction to L2R if we have no previous usable paragraph
1688      direction.  This is allowed by the HL1 clause.  */
1689   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1690     bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1691   if (bidi_it->paragraph_dir == R2L)
1692     bidi_it->level_stack[0].level = 1;
1693   else
1694     bidi_it->level_stack[0].level = 0;
1695
1696   bidi_line_init (bidi_it);
1697 }
1698
1699 \f
1700 /***********************************************************************
1701                  Resolving explicit and implicit levels.
1702   The rest of this file constitutes the core of the UBA implementation.
1703  ***********************************************************************/
1704
1705 static bool
1706 bidi_explicit_dir_char (int ch)
1707 {
1708   bidi_type_t ch_type;
1709
1710   if (!bidi_initialized)
1711     emacs_abort ();
1712   ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1713   return (ch_type == LRE || ch_type == LRO
1714           || ch_type == RLE || ch_type == RLO
1715           || ch_type == PDF);
1716 }
1717
1718 /* Given an iterator state in BIDI_IT, advance one character position
1719    in the buffer/string to the next character (in the logical order),
1720    resolve any explicit embeddings, directional overrides, and isolate
1721    initiators and terminators, and return the embedding level of the
1722    character after resolving these explicit directives.  */
1723 static int
1724 bidi_resolve_explicit (struct bidi_it *bidi_it)
1725 {
1726   int curchar;
1727   bidi_type_t type, typ1, prev_type = UNKNOWN_BT;
1728   int current_level;
1729   int new_level;
1730   bidi_dir_t override;
1731   bool isolate_status;
1732   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1733   ptrdiff_t ch_len, nchars, disp_pos, end;
1734   int disp_prop;
1735
1736   /* Record the info about the previous character.  */
1737   if (bidi_it->type_after_wn != WEAK_BN /* W1/Retaining */
1738       && bidi_it->type != WEAK_BN)
1739     {
1740       /* This special case is needed in support of Unicode 8.0
1741          correction to N0, as implemented in bidi_resolve_weak/W1
1742          below.  */
1743       if (bidi_it->type_after_wn == NEUTRAL_ON
1744           && bidi_get_category (bidi_it->type) == STRONG
1745           && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
1746         bidi_remember_char (&bidi_it->prev, bidi_it, 1);
1747       else
1748         bidi_remember_char (&bidi_it->prev, bidi_it, 0);
1749     }
1750   if (bidi_it->type_after_wn == STRONG_R
1751       || bidi_it->type_after_wn == STRONG_L
1752       || bidi_it->type_after_wn == STRONG_AL)
1753     bidi_remember_char (&bidi_it->last_strong, bidi_it, 0);
1754   if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1755       || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1756     bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it, 1);
1757
1758   /* If we overstepped the characters used for resolving neutrals
1759      and whitespace, invalidate their info in the iterator.  */
1760   if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1761     bidi_it->next_for_neutral.type = UNKNOWN_BT;
1762   if (bidi_it->next_en_pos >= 0
1763       && bidi_it->charpos >= bidi_it->next_en_pos)
1764     {
1765       bidi_it->next_en_pos = 0;
1766       bidi_it->next_en_type = UNKNOWN_BT;
1767     }
1768
1769   /* Reset the bracket resolution info.  */
1770   bidi_it->bracket_pairing_pos = -1;
1771   bidi_it->bracket_enclosed_type = UNKNOWN_BT;
1772
1773   /* If reseat()'ed, don't advance, so as to start iteration from the
1774      position where we were reseated.  bidi_it->bytepos can be less
1775      than BEGV_BYTE after reseat to BEGV.  */
1776   if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1777       || bidi_it->first_elt)
1778     {
1779       bidi_it->first_elt = 0;
1780       if (string_p)
1781         {
1782           const unsigned char *p
1783             = (STRINGP (bidi_it->string.lstring)
1784                ? SDATA (bidi_it->string.lstring)
1785                : bidi_it->string.s);
1786
1787           if (bidi_it->charpos < 0)
1788             bidi_it->charpos = bidi_it->bytepos = 0;
1789           eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1790                                                          bidi_it->charpos,
1791                                                          bidi_it->string.unibyte));
1792         }
1793       else
1794         {
1795           if (bidi_it->charpos < BEGV)
1796             {
1797               bidi_it->charpos = BEGV;
1798               bidi_it->bytepos = BEGV_BYTE;
1799             }
1800           eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1801         }
1802       /* Determine the original bidi type of the previous character,
1803          which is needed for handling isolate initiators and PDF.  The
1804          type of the previous character will be non-trivial only if
1805          our caller moved through some previous text in
1806          get_visually_first_element, in which case bidi_it->prev holds
1807          the information we want.  */
1808       if (bidi_it->first_elt && bidi_it->prev.type != UNKNOWN_BT)
1809         {
1810           eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
1811           prev_type = bidi_it->prev.orig_type;
1812           if (prev_type == FSI)
1813             prev_type = bidi_it->type_after_wn;
1814         }
1815     }
1816   /* Don't move at end of buffer/string.  */
1817   else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1818     {
1819       /* Advance to the next character, skipping characters covered by
1820          display strings (nchars > 1).  */
1821       if (bidi_it->nchars <= 0)
1822         emacs_abort ();
1823       bidi_it->charpos += bidi_it->nchars;
1824       if (bidi_it->ch_len == 0)
1825         emacs_abort ();
1826       bidi_it->bytepos += bidi_it->ch_len;
1827       prev_type = bidi_it->orig_type;
1828       if (prev_type == FSI)
1829         prev_type = bidi_it->type_after_wn;
1830     }
1831   else  /* EOB or end of string */
1832     prev_type = NEUTRAL_B;
1833
1834   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1835   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1836   isolate_status = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
1837   new_level = current_level;
1838
1839   if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1840     {
1841       curchar = BIDI_EOB;
1842       bidi_it->ch_len = 1;
1843       bidi_it->nchars = 1;
1844       bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1845       bidi_it->disp_prop = 0;
1846     }
1847   else
1848     {
1849       /* LRI, RLI, and FSI increment, and PDF decrements, the
1850          embedding level of the _following_ characters, so we must
1851          first look at the type of the previous character to support
1852          that.  */
1853       switch (prev_type)
1854         {
1855         case RLI:       /* X5a */
1856           if (current_level < BIDI_MAXDEPTH
1857               && bidi_it->invalid_levels == 0
1858               && bidi_it->invalid_isolates == 0)
1859             {
1860               new_level = ((current_level + 1) & ~1) + 1;
1861               bidi_it->isolate_level++;
1862               bidi_push_embedding_level (bidi_it, new_level,
1863                                          NEUTRAL_DIR, true);
1864             }
1865           else
1866             bidi_it->invalid_isolates++;
1867           break;
1868         case LRI:       /* X5b */
1869           if (current_level < BIDI_MAXDEPTH - 1
1870               && bidi_it->invalid_levels == 0
1871               && bidi_it->invalid_isolates == 0)
1872             {
1873               new_level = ((current_level + 2) & ~1);
1874               bidi_it->isolate_level++;
1875               bidi_push_embedding_level (bidi_it, new_level,
1876                                          NEUTRAL_DIR, true);
1877             }
1878           else
1879             bidi_it->invalid_isolates++;
1880           break;
1881         case PDF:       /* X7 */
1882           if (!bidi_it->invalid_isolates)
1883             {
1884               if (bidi_it->invalid_levels)
1885                 bidi_it->invalid_levels--;
1886               else if (!isolate_status && bidi_it->stack_idx >= 1)
1887                 new_level = bidi_pop_embedding_level (bidi_it);
1888             }
1889           break;
1890         default:
1891           eassert (prev_type != FSI);
1892           /* Nothing.  */
1893           break;
1894         }
1895       /* Fetch the character at BYTEPOS.  If it is covered by a
1896          display string, treat the entire run of covered characters as
1897          a single character u+FFFC.  */
1898       curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
1899                                  &bidi_it->disp_pos, &bidi_it->disp_prop,
1900                                  &bidi_it->string, bidi_it->w,
1901                                  bidi_it->frame_window_p,
1902                                  &bidi_it->ch_len, &bidi_it->nchars);
1903     }
1904   bidi_it->ch = curchar;
1905   bidi_it->resolved_level = new_level;
1906
1907   /* Don't apply directional override here, as all the types we handle
1908      below will not be affected by the override anyway, and we need
1909      the original type unaltered.  The override will be applied in
1910      bidi_resolve_weak.  */
1911   type = bidi_get_type (curchar, NEUTRAL_DIR);
1912   bidi_it->orig_type = type;
1913   bidi_check_type (bidi_it->orig_type);
1914
1915   bidi_it->type_after_wn = UNKNOWN_BT;
1916
1917   switch (type)
1918     {
1919     case RLE:   /* X2 */
1920     case RLO:   /* X4 */
1921       bidi_it->type_after_wn = type;
1922       bidi_check_type (bidi_it->type_after_wn);
1923       type = WEAK_BN; /* X9/Retaining */
1924       if (new_level < BIDI_MAXDEPTH
1925           && bidi_it->invalid_levels == 0
1926           && bidi_it->invalid_isolates == 0)
1927         {
1928           /* Compute the least odd embedding level greater than
1929              the current level.  */
1930           new_level = ((new_level + 1) & ~1) + 1;
1931           if (bidi_it->type_after_wn == RLE)
1932             override = NEUTRAL_DIR;
1933           else
1934             override = R2L;
1935           bidi_push_embedding_level (bidi_it, new_level, override, false);
1936           bidi_it->resolved_level = new_level;
1937         }
1938       else
1939         {
1940           if (bidi_it->invalid_isolates == 0)
1941             bidi_it->invalid_levels++;
1942         }
1943       break;
1944     case LRE:   /* X3 */
1945     case LRO:   /* X5 */
1946       bidi_it->type_after_wn = type;
1947       bidi_check_type (bidi_it->type_after_wn);
1948       type = WEAK_BN; /* X9/Retaining */
1949       if (new_level < BIDI_MAXDEPTH - 1
1950           && bidi_it->invalid_levels == 0
1951           && bidi_it->invalid_isolates == 0)
1952         {
1953           /* Compute the least even embedding level greater than
1954              the current level.  */
1955           new_level = ((new_level + 2) & ~1);
1956           if (bidi_it->type_after_wn == LRE)
1957             override = NEUTRAL_DIR;
1958           else
1959             override = L2R;
1960           bidi_push_embedding_level (bidi_it, new_level, override, false);
1961           bidi_it->resolved_level = new_level;
1962         }
1963       else
1964         {
1965           if (bidi_it->invalid_isolates == 0)
1966             bidi_it->invalid_levels++;
1967         }
1968       break;
1969     case FSI:   /* X5c */
1970       end = string_p ? bidi_it->string.schars : ZV;
1971       disp_pos = bidi_it->disp_pos;
1972       disp_prop = bidi_it->disp_prop;
1973       nchars = bidi_it->nchars;
1974       ch_len = bidi_it->ch_len;
1975       typ1 = find_first_strong_char (bidi_it->charpos,
1976                                      bidi_it->bytepos, end,
1977                                      &disp_pos, &disp_prop,
1978                                      &bidi_it->string, bidi_it->w,
1979                                      string_p, bidi_it->frame_window_p,
1980                                      &ch_len, &nchars, true);
1981       if (typ1 != STRONG_R && typ1 != STRONG_AL)
1982         {
1983           type = LRI;
1984           goto fsi_as_lri;
1985         }
1986       else
1987         type = RLI;
1988       /* FALLTHROUGH */
1989     case RLI:   /* X5a */
1990       if (override == NEUTRAL_DIR)
1991         bidi_it->type_after_wn = type;
1992       else      /* Unicode 8.0 correction.  */
1993         bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
1994       bidi_check_type (bidi_it->type_after_wn);
1995       break;
1996     case LRI:   /* X5b */
1997     fsi_as_lri:
1998       if (override == NEUTRAL_DIR)
1999         bidi_it->type_after_wn = type;
2000       else      /* Unicode 8.0 correction.  */
2001         bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2002       bidi_check_type (bidi_it->type_after_wn);
2003       break;
2004     case PDI:   /* X6a */
2005       if (bidi_it->invalid_isolates)
2006         bidi_it->invalid_isolates--;
2007       else if (bidi_it->isolate_level > 0)
2008         {
2009           bidi_it->invalid_levels = 0;
2010           while (!bidi_it->level_stack[bidi_it->stack_idx].isolate_status)
2011             bidi_pop_embedding_level (bidi_it);
2012           eassert (bidi_it->stack_idx > 0);
2013           new_level = bidi_pop_embedding_level (bidi_it);
2014           bidi_it->isolate_level--;
2015         }
2016       bidi_it->resolved_level = new_level;
2017       /* Unicode 8.0 correction.  */
2018       if (bidi_it->level_stack[bidi_it->stack_idx].override == L2R)
2019         bidi_it->type_after_wn = STRONG_L;
2020       else if (bidi_it->level_stack[bidi_it->stack_idx].override == R2L)
2021         bidi_it->type_after_wn = STRONG_R;
2022       else
2023         bidi_it->type_after_wn = type;
2024       break;
2025     case PDF:   /* X7 */
2026       bidi_it->type_after_wn = type;
2027       bidi_check_type (bidi_it->type_after_wn);
2028       type = WEAK_BN; /* X9/Retaining */
2029       break;
2030     default:
2031       /* Nothing.  */
2032       break;
2033     }
2034
2035   bidi_it->type = type;
2036   bidi_check_type (bidi_it->type);
2037
2038   if (bidi_it->type == NEUTRAL_B)       /* X8 */
2039     {
2040       bidi_set_paragraph_end (bidi_it);
2041       /* This is needed by bidi_resolve_weak below, and in L1.  */
2042       bidi_it->type_after_wn = bidi_it->type;
2043     }
2044
2045   eassert (bidi_it->resolved_level >= 0);
2046   return bidi_it->resolved_level;
2047 }
2048
2049 /* Advance in the buffer/string, resolve weak types and return the
2050    type of the next character after weak type resolution.  */
2051 static bidi_type_t
2052 bidi_resolve_weak (struct bidi_it *bidi_it)
2053 {
2054   bidi_type_t type;
2055   bidi_dir_t override;
2056   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2057   int new_level  = bidi_resolve_explicit (bidi_it);
2058   int next_char;
2059   bidi_type_t type_of_next;
2060   struct bidi_it saved_it;
2061   ptrdiff_t eob
2062     = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
2063        ? bidi_it->string.schars : ZV);
2064
2065   type = bidi_it->type;
2066   override = bidi_it->level_stack[bidi_it->stack_idx].override;
2067
2068   eassert (!(type == UNKNOWN_BT
2069              || type == LRE
2070              || type == LRO
2071              || type == RLE
2072              || type == RLO
2073              || type == PDF));
2074
2075   eassert (prev_level >= 0);
2076   if (bidi_it->type == NEUTRAL_B)
2077     {
2078       /* We've got a new isolating sequence, compute the directional
2079          type of sos and initialize per-run variables (UAX#9, clause
2080          X10).  */
2081       bidi_set_sos_type (bidi_it, prev_level, new_level);
2082     }
2083   if (type == NEUTRAL_S || type == NEUTRAL_WS
2084       || type == WEAK_BN || type == STRONG_AL)
2085     bidi_it->type_after_wn = type;      /* needed in L1 */
2086   bidi_check_type (bidi_it->type_after_wn);
2087
2088   /* Level and directional override status are already recorded in
2089      bidi_it, and do not need any change; see X6.  */
2090   if (override == R2L)          /* X6 */
2091     type = STRONG_R;
2092   else if (override == L2R)
2093     type = STRONG_L;
2094   else
2095     {
2096       if (type == WEAK_NSM)     /* W1 */
2097         {
2098           /* Note that we don't need to consider the case where the
2099              prev character has its type overridden by an RLO or LRO,
2100              because then either the type of this NSM would have been
2101              also overridden, or the previous character is outside the
2102              current level run, and thus not relevant to this NSM.
2103              This is why NSM gets the type_after_wn of the previous
2104              character.  */
2105           /* bidi_set_sos_type sets type_after_wn to UNKNOWN_BT.  */
2106           if (bidi_it->prev.type != UNKNOWN_BT
2107               /* If type_after_wn is NEUTRAL_B, this NSM is at sos.  */
2108               && bidi_it->prev.type != NEUTRAL_B)
2109             {
2110               if (bidi_isolate_fmt_char (bidi_it->prev.type))
2111                 {
2112                   /* From W1: "Note that in an isolating run sequence,
2113                      an isolate initiator followed by an NSM or any
2114                      type other than PDI must be an overflow isolate
2115                      initiator."  */
2116                   eassert (bidi_it->invalid_isolates > 0);
2117                   type = NEUTRAL_ON;
2118                 }
2119               else
2120                 {
2121                   /* This includes the Unicode 8.0 correction for N0,
2122                      due to how we set prev.type in bidi_resolve_explicit,
2123                      which see.  */
2124                   type = bidi_it->prev.type;
2125                 }
2126             }
2127           else if (bidi_it->sos == R2L)
2128             type = STRONG_R;
2129           else if (bidi_it->sos == L2R)
2130             type = STRONG_L;
2131           else /* shouldn't happen! */
2132             emacs_abort ();
2133         }
2134       if (type == WEAK_EN       /* W2 */
2135           && bidi_it->last_strong.type == STRONG_AL)
2136         type = WEAK_AN;
2137       else if (type == STRONG_AL) /* W3 */
2138         type = STRONG_R;
2139       else if ((type == WEAK_ES /* W4 */
2140                 && bidi_it->prev.type == WEAK_EN
2141                 && bidi_it->prev.orig_type == WEAK_EN)
2142                || (type == WEAK_CS
2143                    && ((bidi_it->prev.type == WEAK_EN
2144                         && bidi_it->prev.orig_type == WEAK_EN)
2145                        || bidi_it->prev.type == WEAK_AN)))
2146         {
2147           const unsigned char *s
2148             = (STRINGP (bidi_it->string.lstring)
2149                ? SDATA (bidi_it->string.lstring)
2150                : bidi_it->string.s);
2151
2152           next_char = (bidi_it->charpos + bidi_it->nchars >= eob
2153                        ? BIDI_EOB
2154                        : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
2155                                            s, bidi_it->string.unibyte));
2156           type_of_next = bidi_get_type (next_char, override);
2157
2158           if (type_of_next == WEAK_BN
2159               || bidi_explicit_dir_char (next_char))
2160             {
2161               bidi_copy_it (&saved_it, bidi_it);
2162               while (bidi_resolve_explicit (bidi_it) == new_level
2163                      && bidi_it->type == WEAK_BN)
2164                 type_of_next = bidi_it->type;
2165               bidi_copy_it (bidi_it, &saved_it);
2166             }
2167
2168           /* If the next character is EN, but the last strong-type
2169              character is AL, that next EN will be changed to AN when
2170              we process it in W2 above.  So in that case, this ES
2171              should not be changed into EN.  */
2172           if (type == WEAK_ES
2173               && type_of_next == WEAK_EN
2174               && bidi_it->last_strong.type != STRONG_AL)
2175             type = WEAK_EN;
2176           else if (type == WEAK_CS)
2177             {
2178               if (bidi_it->prev.type == WEAK_AN
2179                   && (type_of_next == WEAK_AN
2180                       /* If the next character is EN, but the last
2181                          strong-type character is AL, EN will be later
2182                          changed to AN when we process it in W2 above.
2183                          So in that case, this ES should not be
2184                          changed into EN.  */
2185                       || (type_of_next == WEAK_EN
2186                           && bidi_it->last_strong.type == STRONG_AL)))
2187                 type = WEAK_AN;
2188               else if (bidi_it->prev.type == WEAK_EN
2189                        && type_of_next == WEAK_EN
2190                        && bidi_it->last_strong.type != STRONG_AL)
2191                 type = WEAK_EN;
2192             }
2193         }
2194       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
2195                || type == WEAK_BN)      /* W5/Retaining */
2196         {
2197           if (bidi_it->prev.type == WEAK_EN) /* ET/BN w/EN before it */
2198             type = WEAK_EN;
2199           else if (bidi_it->next_en_pos > bidi_it->charpos
2200                    && bidi_it->next_en_type != WEAK_BN)
2201             {
2202               if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2203                 type = WEAK_EN;
2204             }
2205           else if (bidi_it->next_en_pos >=0)
2206             {
2207               /* We overstepped the last known position for ET
2208                  resolution but there could be other such characters
2209                  in this paragraph (when we are sure there are no more
2210                  such positions, we set next_en_pos to a negative
2211                  value).  Try to find the next position for ET
2212                  resolution.  */
2213               ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2214               const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2215                                         ? SDATA (bidi_it->string.lstring)
2216                                         : bidi_it->string.s);
2217
2218               if (bidi_it->nchars <= 0)
2219                 emacs_abort ();
2220               next_char
2221                 = (bidi_it->charpos + bidi_it->nchars >= eob
2222                    ? BIDI_EOB
2223                    : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2224                                        bidi_it->string.unibyte));
2225               type_of_next = bidi_get_type (next_char, override);
2226
2227               if (type_of_next == WEAK_ET
2228                   || type_of_next == WEAK_BN
2229                   || bidi_explicit_dir_char (next_char))
2230                 {
2231                   bidi_copy_it (&saved_it, bidi_it);
2232                   while (bidi_resolve_explicit (bidi_it) == new_level
2233                          && (bidi_it->type == WEAK_BN
2234                              || bidi_it->type == WEAK_ET))
2235                     type_of_next = bidi_it->type;
2236                   if (type == WEAK_BN
2237                       && bidi_it->charpos == saved_it.charpos + saved_it.nchars)
2238                     {
2239                       /* If we entered the above loop with a BN that
2240                          changes the level, the type of next
2241                          character, which is in a different level, is
2242                          not relevant to resolving this series of ET
2243                          and BN.  */
2244                       en_pos = saved_it.charpos;
2245                       type_of_next = type;
2246                     }
2247                   else
2248                     en_pos = bidi_it->charpos;
2249                   bidi_copy_it (bidi_it, &saved_it);
2250                 }
2251               /* Remember this position, to speed up processing of the
2252                  next ETs.  */
2253               bidi_it->next_en_pos = en_pos;
2254               if (type_of_next == WEAK_EN)
2255                 {
2256                   /* If the last strong character is AL, the EN we've
2257                      found will become AN when we get to it (W2). */
2258                   if (bidi_it->last_strong.type == STRONG_AL)
2259                     type_of_next = WEAK_AN;
2260                   else if (type == WEAK_BN)
2261                     type = NEUTRAL_ON; /* W6/Retaining */
2262                   else
2263                     type = WEAK_EN;
2264                 }
2265               else if (type_of_next == NEUTRAL_B)
2266                 /* Record the fact that there are no more ENs from
2267                    here to the end of paragraph, to avoid entering the
2268                    loop above ever again in this paragraph.  */
2269                 bidi_it->next_en_pos = -1;
2270               /* Record the type of the character where we ended our search.  */
2271               bidi_it->next_en_type = type_of_next;
2272             }
2273         }
2274     }
2275
2276   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2277       || (type == WEAK_BN
2278           && (bidi_it->prev.type == WEAK_CS         /* W6/Retaining */
2279               || bidi_it->prev.type == WEAK_ES
2280               || bidi_it->prev.type == WEAK_ET)))
2281     type = NEUTRAL_ON;
2282
2283   /* Store the type we've got so far, before we clobber it with strong
2284      types in W7 and while resolving neutral types.  But leave alone
2285      the original types that were recorded above, because we will need
2286      them for the L1 clause.  */
2287   if (bidi_it->type_after_wn == UNKNOWN_BT)
2288     bidi_it->type_after_wn = type;
2289   bidi_check_type (bidi_it->type_after_wn);
2290
2291   if (type == WEAK_EN)  /* W7 */
2292     {
2293       if ((bidi_it->last_strong.type == STRONG_L)
2294           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sos == L2R))
2295         type = STRONG_L;
2296     }
2297
2298   bidi_it->type = type;
2299   bidi_check_type (bidi_it->type);
2300   return type;
2301 }
2302
2303 /* Resolve the type of a neutral character according to the type of
2304    surrounding strong text and the current embedding level.  */
2305 static bidi_type_t
2306 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2307 {
2308   /* N1: "European and Arabic numbers act as if they were R in terms
2309      of their influence on NIs."  */
2310   if (next_type == WEAK_EN || next_type == WEAK_AN)
2311     next_type = STRONG_R;
2312   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2313     prev_type = STRONG_R;
2314
2315   if (next_type == prev_type)   /* N1 */
2316     return next_type;
2317   else if ((lev & 1) == 0)      /* N2 */
2318     return STRONG_L;
2319   else
2320     return STRONG_R;
2321 }
2322
2323 #define FLAG_EMBEDDING_INSIDE  1
2324 #define FLAG_OPPOSITE_INSIDE   2
2325
2326 /* A data type used in the stack maintained by
2327    bidi_find_bracket_pairs below.  */
2328 typedef struct bpa_stack_entry {
2329   int close_bracket_char;
2330   int open_bracket_idx;
2331 #ifdef ENABLE_CHECKING
2332   ptrdiff_t open_bracket_pos;
2333 #endif
2334   unsigned flags : 2;
2335 } bpa_stack_entry;
2336
2337 /* With MAX_ALLOCA of 16KB, this should allow at least 1K slots in the
2338    BPA stack, which should be more than enough for actual bidi text.  */
2339 #define MAX_BPA_STACK (max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
2340
2341 /* UAX#9 says to match opening brackets with the matching closing
2342    brackets or their canonical equivalents.  As of Unicode 7.0, there
2343    are only 2 bracket characters that have canonical equivalence
2344    decompositions: u+2329 and u+232A.  So instead of accessing the
2345    table in uni-decomposition.el, we just handle these 2 characters
2346    with this simple macro.  Note that ASCII characters don't have
2347    canonical equivalents by definition.  */
2348
2349 /* To find all the characters that need to be processed by
2350    CANONICAL_EQU, first find all the characters which have
2351    decompositions in UnicodeData.txt, with this Awk script:
2352
2353     awk -F ";" " {if ($6 != \"\") print $1, $6}" UnicodeData.txt
2354
2355    Then produce a list of all the bracket characters in BidiBrackets.txt:
2356
2357     awk -F "[ ;]" " {if ($1 != \"#\" && $1 != \"\") print $1}" BidiBrackets.txt
2358
2359    And finally, cross-reference these two:
2360
2361     fgrep -w -f brackets.txt decompositions.txt
2362
2363    where "decompositions.txt" was produced by the 1st script, and
2364    "brackets.txt" by the 2nd script.  In the output of fgrep, look
2365    only for decompositions that don't begin with some compatibility
2366    formatting tag, such as "<compat>".  Only decompositions that
2367    consist solely of character codepoints are relevant to bidi
2368    brackets processing.  */
2369
2370 #define CANONICAL_EQU(c)                                        \
2371   ( ASCII_CHAR_P (c) ? c                                        \
2372     : (c) == 0x2329 ? 0x3008                                    \
2373     : (c) == 0x232a ? 0x3009                                    \
2374     : c )
2375
2376 #ifdef ENABLE_CHECKING
2377 # define STORE_BRACKET_CHARPOS \
2378    bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2379 #else
2380 # define STORE_BRACKET_CHARPOS  /* nothing */
2381 #endif
2382
2383 #define PUSH_BPA_STACK                                                  \
2384   do {                                                                  \
2385     int ch;                                                             \
2386     bpa_sp++;                                                           \
2387     if (bpa_sp >= MAX_BPA_STACK)                                        \
2388       {                                                                 \
2389         bpa_sp = MAX_BPA_STACK - 1;                                     \
2390         goto bpa_give_up;                                               \
2391       }                                                                 \
2392     ch = CANONICAL_EQU (bidi_it->ch);                                   \
2393     bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (ch);       \
2394     bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx;           \
2395     bpa_stack[bpa_sp].flags = 0;                                        \
2396     STORE_BRACKET_CHARPOS;                                              \
2397   } while (0)
2398
2399
2400 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2401    described in BD16 and N0 of UAX#9.  It finds all the bracket pairs
2402    in the current isolating sequence, and records the enclosed type
2403    and the position of the matching bracket in the cache.  It returns
2404    non-zero if called with the iterator on the opening bracket which
2405    has a matching closing bracket in the current isolating sequence,
2406    zero otherwise.  */
2407 static bool
2408 bidi_find_bracket_pairs (struct bidi_it *bidi_it)
2409 {
2410   bidi_bracket_type_t btype;
2411   bidi_type_t type = bidi_it->type;
2412   bool retval = false;
2413
2414   /* When scanning backwards, we don't expect any unresolved bidi
2415      bracket characters.  */
2416   if (bidi_it->scan_dir != 1)
2417     emacs_abort ();
2418
2419   btype = bidi_paired_bracket_type (bidi_it->ch);
2420   if (btype == BIDI_BRACKET_OPEN)
2421     {
2422       bpa_stack_entry bpa_stack[MAX_BPA_STACK];
2423       int bpa_sp = -1;
2424       struct bidi_it saved_it;
2425       int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2426       bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2427       struct bidi_it tem_it;
2428
2429       eassert (MAX_BPA_STACK >= 100);
2430       bidi_copy_it (&saved_it, bidi_it);
2431       /* bidi_cache_iterator_state refuses to cache on backward scans,
2432          and bidi_cache_fetch_state doesn't bring scan_dir from the
2433          cache, so we must initialize this explicitly.  */
2434       tem_it.scan_dir = 1;
2435
2436       while (1)
2437         {
2438           int old_sidx, new_sidx;
2439           int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2440
2441           /* Mark every opening bracket character we've traversed by
2442              putting its own position into bracket_pairing_pos.  This
2443              is examined in bidi_resolve_brackets to distinguish
2444              brackets that were already resolved to stay NEUTRAL_ON,
2445              and those that were not yet processed by this function
2446              (because they were skipped when we skip higher embedding
2447              levels below).  */
2448           if (btype == BIDI_BRACKET_OPEN && bidi_it->bracket_pairing_pos == -1)
2449             bidi_it->bracket_pairing_pos = bidi_it->charpos;
2450           bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2451           if (btype == BIDI_BRACKET_OPEN)
2452             PUSH_BPA_STACK;
2453           else if (btype == BIDI_BRACKET_CLOSE)
2454             {
2455               int sp = bpa_sp;
2456               int curchar = CANONICAL_EQU (bidi_it->ch);
2457
2458               eassert (sp >= 0);
2459               while (sp >= 0 && bpa_stack[sp].close_bracket_char != curchar)
2460                 sp--;
2461               if (sp >= 0)
2462                 {
2463                   /* Update and cache the corresponding opening bracket.  */
2464                   bidi_cache_fetch_state (bpa_stack[sp].open_bracket_idx,
2465                                           &tem_it);
2466 #ifdef ENABLE_CHECKING
2467                   eassert (bpa_stack[sp].open_bracket_pos == tem_it.charpos);
2468 #endif
2469                   /* Determine the enclosed type for this bracket
2470                      pair's type resolution according to N0.  */
2471                   if (bpa_stack[sp].flags & FLAG_EMBEDDING_INSIDE)
2472                     tem_it.bracket_enclosed_type = embedding_type; /* N0b */
2473                   else if (bpa_stack[sp].flags & FLAG_OPPOSITE_INSIDE)
2474                     tem_it.bracket_enclosed_type                   /* N0c */
2475                       = (embedding_type == STRONG_L ? STRONG_R : STRONG_L);
2476                   else                                             /* N0d */
2477                     tem_it.bracket_enclosed_type = UNKNOWN_BT;
2478
2479                   /* Record the position of the matching closing
2480                      bracket, and update the cache.  */
2481                   tem_it.bracket_pairing_pos = bidi_it->charpos;
2482                   bidi_cache_iterator_state (&tem_it, 0, 1);
2483
2484                   /* Pop the BPA stack.  */
2485                   bpa_sp = sp - 1;
2486                 }
2487               if (bpa_sp < 0)
2488                 {
2489                   retval = true;
2490                   break;
2491                 }
2492             }
2493           else if (bidi_get_category (bidi_it->type_after_wn) != NEUTRAL)
2494             {
2495               unsigned flag = 0;
2496               int sp;
2497
2498               /* Whenever we see a strong type, update the flags of
2499                  all the slots on the stack.  */
2500               switch (bidi_it->type)
2501                 {
2502                 case STRONG_L:
2503                   flag = ((embedding_level & 1) == 0
2504                           ? FLAG_EMBEDDING_INSIDE
2505                           : FLAG_OPPOSITE_INSIDE);
2506                   break;
2507                 case STRONG_R:
2508                 case WEAK_EN:
2509                 case WEAK_AN:
2510                   flag = ((embedding_level & 1) == 1
2511                           ? FLAG_EMBEDDING_INSIDE
2512                           : FLAG_OPPOSITE_INSIDE);
2513                   break;
2514                 default:
2515                   break;
2516                 }
2517               if (flag)
2518                 {
2519                   for (sp = bpa_sp; sp >= 0; sp--)
2520                     bpa_stack[sp].flags |= flag;
2521                 }
2522             }
2523           old_sidx = bidi_it->stack_idx;
2524           type = bidi_resolve_weak (bidi_it);
2525           /* Skip level runs excluded from this isolating run sequence.  */
2526           new_sidx = bidi_it->stack_idx;
2527           if (bidi_it->level_stack[new_sidx].level > current_level
2528               && (bidi_it->level_stack[new_sidx].isolate_status
2529                   || (new_sidx > old_sidx + 1
2530                       && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2531             {
2532               while (bidi_it->level_stack[bidi_it->stack_idx].level
2533                      > current_level)
2534                 {
2535                   bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2536                   type = bidi_resolve_weak (bidi_it);
2537                 }
2538             }
2539           if (type == NEUTRAL_B
2540               || (bidi_it->level_stack[bidi_it->stack_idx].level
2541                   != current_level))
2542             {
2543             bpa_give_up:
2544               /* We've marched all the way to the end of this
2545                  isolating run sequence, and didn't find matching
2546                  closing brackets for some opening brackets.  Leave
2547                  their type unchanged.  */
2548               break;
2549             }
2550           if (bidi_it->type_after_wn == NEUTRAL_ON) /* Unicode 8.0 correction */
2551             btype = bidi_paired_bracket_type (bidi_it->ch);
2552           else
2553             btype = BIDI_BRACKET_NONE;
2554         }
2555
2556       /* Restore bidi_it from the cache, which should have the bracket
2557          resolution members set as determined by the above loop.  */
2558       type = bidi_cache_find (saved_it.charpos, 0, bidi_it);
2559       eassert (type == NEUTRAL_ON);
2560     }
2561
2562   return retval;
2563 }
2564
2565 static void
2566 bidi_record_type_for_neutral (struct bidi_saved_info *info, int level,
2567                               bool nextp)
2568 {
2569   int idx;
2570
2571   for (idx = bidi_cache_last_idx + 1; idx < bidi_cache_idx; idx++)
2572     {
2573       int lev = bidi_cache[idx].level_stack[bidi_cache[idx].stack_idx].level;
2574
2575       if (lev <= level)
2576         {
2577           eassert (lev == level);
2578           if (nextp)
2579             bidi_cache[idx].next_for_neutral = *info;
2580           else
2581             bidi_cache[idx].prev_for_neutral = *info;
2582           break;
2583         }
2584     }
2585 }
2586
2587 static bidi_type_t
2588 bidi_resolve_brackets (struct bidi_it *bidi_it)
2589 {
2590   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2591   bool resolve_bracket = false;
2592   bidi_type_t type = UNKNOWN_BT;
2593   int ch;
2594   struct bidi_saved_info prev_for_neutral, next_for_neutral;
2595
2596   /* Record the prev_for_neutral type either from the previous
2597      character, if it was a strong or AN/EN, or from the
2598      prev_for_neutral information recorded previously.  */
2599   if (bidi_it->type == STRONG_L || bidi_it->type == STRONG_R
2600       || bidi_it->type == WEAK_AN || bidi_it->type == WEAK_EN)
2601     bidi_remember_char (&prev_for_neutral, bidi_it, 1);
2602   else
2603     prev_for_neutral = bidi_it->prev_for_neutral;
2604   /* Record the next_for_neutral type information.  */
2605   if (bidi_it->next_for_neutral.charpos > bidi_it->charpos)
2606     next_for_neutral = bidi_it->next_for_neutral;
2607   else
2608     next_for_neutral.charpos = -1;
2609   if (!bidi_it->first_elt)
2610     {
2611       type = bidi_cache_find (bidi_it->charpos + bidi_it->nchars, 0, bidi_it);
2612       ch = bidi_it->ch;
2613     }
2614   if (type == UNKNOWN_BT)
2615     {
2616       type = bidi_resolve_weak (bidi_it);
2617       if (type == NEUTRAL_ON && bidi_find_bracket_pairs (bidi_it))
2618         resolve_bracket = true;
2619     }
2620   else
2621     {
2622       eassert (bidi_it->resolved_level == -1);
2623       /* If the cached state shows an increase of embedding level due
2624          to an isolate initiator, we need to update the 1st cached
2625          state of the next run of the current isolating sequence with
2626          the prev_for_neutral and next_for_neutral information, so
2627          that it will be picked up when we advance to that next run.  */
2628       if (bidi_it->level_stack[bidi_it->stack_idx].level > prev_level
2629           && bidi_it->level_stack[bidi_it->stack_idx].isolate_status)
2630         {
2631           bidi_record_type_for_neutral (&prev_for_neutral, prev_level, 0);
2632           bidi_record_type_for_neutral (&next_for_neutral, prev_level, 1);
2633         }
2634       if (type == NEUTRAL_ON
2635           && bidi_paired_bracket_type (ch) == BIDI_BRACKET_OPEN)
2636         {
2637           if (bidi_it->bracket_pairing_pos > bidi_it->charpos)
2638             {
2639               /* A cached opening bracket that wasn't completely
2640                  resolved yet.  */
2641               resolve_bracket = true;
2642             }
2643           else if (bidi_it->bracket_pairing_pos == -1)
2644             {
2645               /* Higher levels were not BPA-resolved yet, even if
2646                  cached by bidi_find_bracket_pairs.  Force application
2647                  of BPA to the new level now.  */
2648               if (bidi_find_bracket_pairs (bidi_it))
2649                 resolve_bracket = true;
2650             }
2651         }
2652       /* Keep track of the prev_for_neutral and next_for_neutral
2653          types, needed for resolving brackets below and for resolving
2654          neutrals in bidi_resolve_neutral.  */
2655       if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level)
2656         {
2657           bidi_it->prev_for_neutral = prev_for_neutral;
2658           if (next_for_neutral.charpos > 0)
2659             bidi_it->next_for_neutral = next_for_neutral;
2660         }
2661     }
2662
2663   /* If needed, resolve the bracket type according to N0.  */
2664   if (resolve_bracket)
2665     {
2666       int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2667       bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2668
2669       eassert (bidi_it->prev_for_neutral.type != UNKNOWN_BT);
2670       eassert (bidi_it->bracket_pairing_pos > bidi_it->charpos);
2671       if (bidi_it->bracket_enclosed_type == embedding_type) /* N0b */
2672         type = embedding_type;
2673       else
2674         {
2675           switch (bidi_it->prev_for_neutral.type)
2676             {
2677             case STRONG_R:
2678             case WEAK_EN:
2679             case WEAK_AN:
2680               type =
2681                 (bidi_it->bracket_enclosed_type == STRONG_R) /* N0c */
2682                 ? STRONG_R                                   /* N0c1 */
2683                 : embedding_type;                            /* N0c2 */
2684               break;
2685             case STRONG_L:
2686               type =
2687                 (bidi_it->bracket_enclosed_type == STRONG_L) /* N0c */
2688                 ? STRONG_L                                   /* N0c1 */
2689                 : embedding_type;                            /* N0c2 */
2690               break;
2691             default:
2692               /* N0d: Do not set the type for that bracket pair.  */
2693               break;
2694             }
2695         }
2696       eassert (type == STRONG_L || type == STRONG_R || type == NEUTRAL_ON);
2697
2698       /* Update the type of the paired closing bracket to the same
2699          type as for the resolved opening bracket.  */
2700       if (type != NEUTRAL_ON)
2701         {
2702           ptrdiff_t idx = bidi_cache_search (bidi_it->bracket_pairing_pos,
2703                                              -1, 1);
2704
2705           if (idx < bidi_cache_start)
2706             emacs_abort ();
2707           bidi_cache[idx].type = type;
2708         }
2709     }
2710
2711   return type;
2712 }
2713
2714 static bidi_type_t
2715 bidi_resolve_neutral (struct bidi_it *bidi_it)
2716 {
2717   bidi_type_t type = bidi_resolve_brackets (bidi_it);
2718   int current_level;
2719   bool is_neutral;
2720
2721   eassert (type == STRONG_R
2722            || type == STRONG_L
2723            || type == WEAK_BN
2724            || type == WEAK_EN
2725            || type == WEAK_AN
2726            || type == NEUTRAL_B
2727            || type == NEUTRAL_S
2728            || type == NEUTRAL_WS
2729            || type == NEUTRAL_ON
2730            || type == LRI
2731            || type == RLI
2732            || type == PDI);
2733
2734   current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2735   eassert (current_level >= 0);
2736   is_neutral = bidi_get_category (type) == NEUTRAL;
2737
2738   if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2739                             we are already at paragraph end.  */
2740        && (is_neutral || bidi_isolate_fmt_char (type)))
2741       /* N1-N2/Retaining */
2742       || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch)))
2743     {
2744       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
2745         {
2746           /* Make sure the data for resolving neutrals we are
2747              about to use is valid.  */
2748           eassert (bidi_it->next_for_neutral.charpos > bidi_it->charpos
2749                    /* PDI defines an eos, so it's OK for it to
2750                       serve as its own next_for_neutral.  */
2751                    || (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2752                        && bidi_it->type == PDI));
2753           type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2754                                          bidi_it->next_for_neutral.type,
2755                                          current_level);
2756         }
2757       /* The next two "else if" clauses are shortcuts for the
2758          important special case when we have a long sequence of
2759          neutral or WEAK_BN characters, such as whitespace or nulls or
2760          other control characters, on the base embedding level of the
2761          paragraph, and that sequence goes all the way to the end of
2762          the paragraph and follows a character whose resolved
2763          directionality is identical to the base embedding level.
2764          (This is what happens in a buffer with plain L2R text that
2765          happens to include long sequences of control characters.)  By
2766          virtue of N1, the result of examining this long sequence will
2767          always be either STRONG_L or STRONG_R, depending on the base
2768          embedding level.  So we use this fact directly instead of
2769          entering the expensive loop in the "else" clause.  */
2770       else if (current_level == 0
2771                && bidi_it->prev_for_neutral.type == STRONG_L
2772                && !bidi_explicit_dir_char (bidi_it->ch)
2773                && !bidi_isolate_fmt_char (type))
2774         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2775                                        STRONG_L, current_level);
2776       else if (/* current level is 1 */
2777                current_level == 1
2778                /* base embedding level is also 1 */
2779                && bidi_it->level_stack[0].level == 1
2780                /* previous character is one of those considered R for
2781                   the purposes of W5 */
2782                && (bidi_it->prev_for_neutral.type == STRONG_R
2783                    || bidi_it->prev_for_neutral.type == WEAK_EN
2784                    || bidi_it->prev_for_neutral.type == WEAK_AN)
2785                && !bidi_explicit_dir_char (bidi_it->ch)
2786                && !bidi_isolate_fmt_char (type))
2787         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2788                                        STRONG_R, current_level);
2789       else
2790         {
2791           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
2792              the assumption of batch-style processing; see clauses W4,
2793              W5, and especially N1, which require to look far forward
2794              (as well as back) in the buffer/string.  May the fleas of
2795              a thousand camels infest the armpits of those who design
2796              supposedly general-purpose algorithms by looking at their
2797              own implementations, and fail to consider other possible
2798              implementations!  */
2799           struct bidi_it saved_it;
2800           bidi_type_t next_type;
2801           bool adjacent_to_neutrals = is_neutral;
2802
2803           bidi_copy_it (&saved_it, bidi_it);
2804           /* Scan the text forward until we find the first non-neutral
2805              character, and then use that to resolve the neutral we
2806              are dealing with now.  We also cache the scanned iterator
2807              states, to salvage some of the effort later.  */
2808           do {
2809             int old_sidx, new_sidx;
2810
2811             /* Paragraph separators have their levels fully resolved
2812                at this point, so cache them as resolved.  */
2813             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2814             old_sidx = bidi_it->stack_idx;
2815             type = bidi_resolve_brackets (bidi_it);
2816             /* Skip level runs excluded from this isolating run sequence.  */
2817             new_sidx = bidi_it->stack_idx;
2818             if (bidi_it->level_stack[new_sidx].level > current_level
2819                 && (bidi_it->level_stack[new_sidx].isolate_status
2820                     /* This is for when we have an isolate initiator
2821                        immediately followed by an embedding or
2822                        override initiator, in which case we get the
2823                        level stack pushed twice by the single call to
2824                        bidi_resolve_weak above.  */
2825                     || (new_sidx > old_sidx + 1
2826                         && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2827               {
2828                 while (bidi_it->level_stack[bidi_it->stack_idx].level
2829                        > current_level)
2830                   {
2831                     bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2832                     type = bidi_resolve_brackets (bidi_it);
2833                   }
2834               }
2835             if (!adjacent_to_neutrals
2836                 && (bidi_get_category (type) == NEUTRAL
2837                     || bidi_isolate_fmt_char (type)))
2838               adjacent_to_neutrals = true;
2839           } while (!(type == NEUTRAL_B
2840                      || (type != WEAK_BN
2841                          && bidi_get_category (type) != NEUTRAL
2842                          && !bidi_isolate_fmt_char (type))
2843                      /* This is all per level run, so stop when we
2844                         reach the end of this level run.  */
2845                      || (bidi_it->level_stack[bidi_it->stack_idx].level
2846                          != current_level)));
2847
2848           /* Record the character we stopped at.  */
2849           bidi_remember_char (&saved_it.next_for_neutral, bidi_it, 1);
2850
2851           if ((bidi_it->level_stack[bidi_it->stack_idx].level != current_level)
2852               || type == NEUTRAL_B)
2853             {
2854               /* Marched all the way to the end of this level run.  We
2855                  need to use the eos type, whose information is stored
2856                  by bidi_set_sos_type in the prev_for_neutral
2857                  member.  */
2858               if (adjacent_to_neutrals)
2859                 next_type = bidi_it->prev_for_neutral.type;
2860               else
2861                 {
2862                   /* This is a BN which does not adjoin neutrals.
2863                      Leave its type alone.  */
2864                   bidi_copy_it (bidi_it, &saved_it);
2865                   return bidi_it->type;
2866                 }
2867             }
2868           else
2869             {
2870               switch (type)
2871                 {
2872                 case STRONG_L:
2873                 case STRONG_R:
2874                 case STRONG_AL:
2875                   /* Actually, STRONG_AL cannot happen here, because
2876                      bidi_resolve_weak converts it to STRONG_R, per W3.  */
2877                   eassert (type != STRONG_AL);
2878                   next_type = type;
2879                   break;
2880                 case WEAK_EN:
2881                 case WEAK_AN:
2882                   /* N1: "European and Arabic numbers act as if they
2883                      were R in terms of their influence on NIs."  */
2884                   next_type = STRONG_R;
2885                   break;
2886                 default:
2887                   emacs_abort ();
2888                   break;
2889                 }
2890             }
2891           /* Resolve the type of all the NIs found during the above loop.  */
2892           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
2893                                          next_type, current_level);
2894           /* Update next_for_neutral with the resolved type, so we
2895              could use it for all the other NIs up to the place where
2896              we exited the loop.  */
2897           saved_it.next_for_neutral.type = next_type;
2898           bidi_check_type (type);
2899           /* Update the character which caused us to enter the above loop.  */
2900           saved_it.type = type;
2901           bidi_check_type (next_type);
2902           bidi_copy_it (bidi_it, &saved_it);
2903         }
2904     }
2905   return type;
2906 }
2907
2908 /* Given an iterator state in BIDI_IT, advance one character position
2909    in the buffer/string to the next character (in the logical order),
2910    resolve the bidi type of that next character, and return that
2911    type.  */
2912 static bidi_type_t
2913 bidi_type_of_next_char (struct bidi_it *bidi_it)
2914 {
2915   bidi_type_t type;
2916
2917   /* This should always be called during a forward scan.  */
2918   if (bidi_it->scan_dir != 1)
2919     emacs_abort ();
2920
2921   type = bidi_resolve_neutral (bidi_it);
2922
2923   return type;
2924 }
2925
2926 /* Given an iterator state BIDI_IT, advance one character position in
2927    the buffer/string to the next character (in the current scan
2928    direction), resolve the embedding and implicit levels of that next
2929    character, and return the resulting level.  */
2930 static int
2931 bidi_level_of_next_char (struct bidi_it *bidi_it)
2932 {
2933   bidi_type_t type = UNKNOWN_BT;
2934   int level;
2935   ptrdiff_t next_char_pos = -2;
2936
2937   if (bidi_it->scan_dir == 1)
2938     {
2939       ptrdiff_t eob
2940         = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2941            ? bidi_it->string.schars : ZV);
2942
2943       /* There's no sense in trying to advance if we've already hit
2944          the end of text.  */
2945       if (bidi_it->charpos >= eob)
2946         {
2947           eassert (bidi_it->resolved_level >= 0);
2948           return bidi_it->resolved_level;
2949         }
2950     }
2951
2952   /* Perhaps the character we want is already cached s fully resolved.
2953      If it is, the call to bidi_cache_find below will return a type
2954      other than UNKNOWN_BT.  */
2955   if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
2956     {
2957       int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2958                  ? 0 : 1);
2959
2960       if (bidi_it->scan_dir > 0)
2961         {
2962           if (bidi_it->nchars <= 0)
2963             emacs_abort ();
2964           next_char_pos = bidi_it->charpos + bidi_it->nchars;
2965         }
2966       else if (bidi_it->charpos >= bob)
2967         /* Implementation note: we allow next_char_pos to be as low as
2968            0 for buffers or -1 for strings, and that is okay because
2969            that's the "position" of the sentinel iterator state we
2970            cached at the beginning of the iteration.  */
2971         next_char_pos = bidi_it->charpos - 1;
2972       if (next_char_pos >= bob - 1)
2973         type = bidi_cache_find (next_char_pos, 1, bidi_it);
2974       if (type != UNKNOWN_BT)
2975         {
2976           /* We asked the cache for fully resolved states.  */
2977           eassert (bidi_it->resolved_level >= 0);
2978           return bidi_it->resolved_level;
2979         }
2980     }
2981
2982   if (bidi_it->scan_dir == -1)
2983     /* If we are going backwards, the iterator state is already cached
2984        from previous scans, and should be fully resolved.  */
2985     emacs_abort ();
2986
2987   if (type == UNKNOWN_BT)
2988     type = bidi_type_of_next_char (bidi_it);
2989
2990   if (type == NEUTRAL_B)
2991     {
2992       eassert (bidi_it->resolved_level >= 0);
2993       return bidi_it->resolved_level;
2994     }
2995
2996   level = bidi_it->level_stack[bidi_it->stack_idx].level;
2997
2998   eassert ((type == STRONG_R
2999             || type == STRONG_L
3000             || type == WEAK_BN
3001             || type == WEAK_EN
3002             || type == WEAK_AN));
3003   bidi_it->type = type;
3004   bidi_check_type (bidi_it->type);
3005
3006   /* For L1 below, we need to know, for each WS character, whether
3007      it belongs to a sequence of WS characters preceding a newline
3008      or a TAB or a paragraph separator.  */
3009   if ((bidi_it->orig_type == NEUTRAL_WS
3010        || bidi_isolate_fmt_char (bidi_it->orig_type))
3011       && bidi_it->next_for_ws.charpos < bidi_it->charpos)
3012     {
3013       int ch;
3014       ptrdiff_t clen = bidi_it->ch_len;
3015       ptrdiff_t bpos = bidi_it->bytepos;
3016       ptrdiff_t cpos = bidi_it->charpos;
3017       ptrdiff_t disp_pos = bidi_it->disp_pos;
3018       ptrdiff_t nc = bidi_it->nchars;
3019       struct bidi_string_data bs = bidi_it->string;
3020       bidi_type_t chtype;
3021       bool fwp = bidi_it->frame_window_p;
3022       int dpp = bidi_it->disp_prop;
3023
3024       if (bidi_it->nchars <= 0)
3025         emacs_abort ();
3026       do {
3027         ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
3028                               bidi_it->w, fwp, &clen, &nc);
3029         chtype = bidi_get_type (ch, NEUTRAL_DIR);
3030       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
3031                || bidi_isolate_fmt_char (chtype)
3032                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
3033       bidi_it->next_for_ws.type = chtype;
3034       bidi_check_type (bidi_it->next_for_ws.type);
3035       bidi_it->next_for_ws.charpos = cpos;
3036     }
3037
3038   /* Update the cache, but only if this state was already cached.  */
3039   bidi_cache_iterator_state (bidi_it, 1, 1);
3040
3041   /* Resolve implicit levels.  */
3042   if (bidi_it->orig_type == NEUTRAL_B /* L1 */
3043            || bidi_it->orig_type == NEUTRAL_S
3044            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
3045            || (bidi_it->orig_type == NEUTRAL_WS
3046                && (bidi_it->next_for_ws.type == NEUTRAL_B
3047                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
3048     level = bidi_it->level_stack[0].level;
3049   else if ((level & 1) == 0) /* I1 */
3050     {
3051       if (type == STRONG_R)
3052         level++;
3053       else if (type == WEAK_EN || type == WEAK_AN)
3054         level += 2;
3055     }
3056   else                  /* I2 */
3057     {
3058       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
3059         level++;
3060     }
3061
3062   bidi_it->resolved_level = level;
3063   return level;
3064 }
3065
3066 /* Move to the other edge of a level given by LEVEL.  If END_FLAG,
3067    we are at the end of a level, and we need to prepare to
3068    resume the scan of the lower level.
3069
3070    If this level's other edge is cached, we simply jump to it, filling
3071    the iterator structure with the iterator state on the other edge.
3072    Otherwise, we walk the buffer or string until we come back to the
3073    same level as LEVEL.
3074
3075    Note: we are not talking here about a ``level run'' in the UAX#9
3076    sense of the term, but rather about a ``level'' which includes
3077    all the levels higher than it.  In other words, given the levels
3078    like this:
3079
3080          11111112222222333333334443343222222111111112223322111
3081                 A      B                    C
3082
3083    and assuming we are at point A scanning left to right, this
3084    function moves to point C, whereas the UAX#9 ``level 2 run'' ends
3085    at point B.  */
3086 static void
3087 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
3088 {
3089   int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
3090   ptrdiff_t idx;
3091
3092   /* Try the cache first.  */
3093   if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
3094       >= bidi_cache_start)
3095     bidi_cache_fetch_state (idx, bidi_it);
3096   else
3097     {
3098       int new_level;
3099
3100       /* If we are at end of level, its edges must be cached.  */
3101       if (end_flag)
3102         emacs_abort ();
3103
3104       bidi_cache_iterator_state (bidi_it, 1, 0);
3105       do {
3106         new_level = bidi_level_of_next_char (bidi_it);
3107         bidi_cache_iterator_state (bidi_it, 1, 0);
3108       } while (new_level >= level);
3109     }
3110 }
3111
3112 void
3113 bidi_move_to_visually_next (struct bidi_it *bidi_it)
3114 {
3115   int old_level, new_level, next_level;
3116   struct bidi_it sentinel;
3117   struct gcpro gcpro1;
3118
3119   if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
3120     emacs_abort ();
3121
3122   if (bidi_it->scan_dir == 0)
3123     {
3124       bidi_it->scan_dir = 1;    /* default to logical order */
3125     }
3126
3127   /* The code below can call eval, and thus cause GC.  If we are
3128      iterating a Lisp string, make sure it won't be GCed.  */
3129   if (STRINGP (bidi_it->string.lstring))
3130     GCPRO1 (bidi_it->string.lstring);
3131
3132   /* If we just passed a newline, initialize for the next line.  */
3133   if (!bidi_it->first_elt
3134       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3135     bidi_line_init (bidi_it);
3136
3137   /* Prepare the sentinel iterator state, and cache it.  When we bump
3138      into it, scanning backwards, we'll know that the last non-base
3139      level is exhausted.  */
3140   if (bidi_cache_idx == bidi_cache_start)
3141     {
3142       bidi_copy_it (&sentinel, bidi_it);
3143       if (bidi_it->first_elt)
3144         {
3145           sentinel.charpos--;   /* cached charpos needs to be monotonic */
3146           sentinel.bytepos--;
3147           sentinel.ch = '\n';   /* doesn't matter, but why not? */
3148           sentinel.ch_len = 1;
3149           sentinel.nchars = 1;
3150         }
3151       bidi_cache_iterator_state (&sentinel, 1, 0);
3152     }
3153
3154   old_level = bidi_it->resolved_level;
3155   new_level = bidi_level_of_next_char (bidi_it);
3156
3157   /* Reordering of resolved levels (clause L2) is implemented by
3158      jumping to the other edge of the level and flipping direction of
3159      scanning the text whenever we find a level change.  */
3160   if (new_level != old_level)
3161     {
3162       bool ascending = new_level > old_level;
3163       int level_to_search = ascending ? old_level + 1 : old_level;
3164       int incr = ascending ? 1 : -1;
3165       int expected_next_level = old_level + incr;
3166
3167       /* Jump (or walk) to the other edge of this level.  */
3168       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3169       /* Switch scan direction and peek at the next character in the
3170          new direction.  */
3171       bidi_it->scan_dir = -bidi_it->scan_dir;
3172
3173       /* The following loop handles the case where the resolved level
3174          jumps by more than one.  This is typical for numbers inside a
3175          run of text with left-to-right embedding direction, but can
3176          also happen in other situations.  In those cases the decision
3177          where to continue after a level change, and in what direction,
3178          is tricky.  For example, given a text like below:
3179
3180                   abcdefgh
3181                   11336622
3182
3183          (where the numbers below the text show the resolved levels),
3184          the result of reordering according to UAX#9 should be this:
3185
3186                   efdcghba
3187
3188          This is implemented by the loop below which flips direction
3189          and jumps to the other edge of the level each time it finds
3190          the new level not to be the expected one.  The expected level
3191          is always one more or one less than the previous one.  */
3192       next_level = bidi_peek_at_next_level (bidi_it);
3193       while (next_level != expected_next_level)
3194         {
3195           /* If next_level is -1, it means we have an unresolved level
3196              in the cache, which at this point should not happen.  If
3197              it does, we will infloop.  */
3198           eassert (next_level >= 0);
3199           /* If next_level is not consistent with incr, we might
3200              infloop.  */
3201           eassert (incr > 0
3202                    ? next_level > expected_next_level
3203                    : next_level < expected_next_level);
3204           expected_next_level += incr;
3205           level_to_search += incr;
3206           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3207           bidi_it->scan_dir = -bidi_it->scan_dir;
3208           next_level = bidi_peek_at_next_level (bidi_it);
3209         }
3210
3211       /* Finally, deliver the next character in the new direction.  */
3212       next_level = bidi_level_of_next_char (bidi_it);
3213     }
3214
3215   /* Take note when we have just processed the newline that precedes
3216      the end of the paragraph.  The next time we are about to be
3217      called, set_iterator_to_next will automatically reinit the
3218      paragraph direction, if needed.  We do this at the newline before
3219      the paragraph separator, because the next character might not be
3220      the first character of the next paragraph, due to the bidi
3221      reordering, whereas we _must_ know the paragraph base direction
3222      _before_ we process the paragraph's text, since the base
3223      direction affects the reordering.  */
3224   if (bidi_it->scan_dir == 1
3225       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3226     {
3227       /* The paragraph direction of the entire string, once
3228          determined, is in effect for the entire string.  Setting the
3229          separator limit to the end of the string prevents
3230          bidi_paragraph_init from being called automatically on this
3231          string.  */
3232       if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3233         bidi_it->separator_limit = bidi_it->string.schars;
3234       else if (bidi_it->bytepos < ZV_BYTE)
3235         {
3236           ptrdiff_t sep_len
3237             = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
3238                                      bidi_it->bytepos + bidi_it->ch_len);
3239           if (bidi_it->nchars <= 0)
3240             emacs_abort ();
3241           if (sep_len >= 0)
3242             {
3243               bidi_it->new_paragraph = 1;
3244               /* Record the buffer position of the last character of the
3245                  paragraph separator.  */
3246               bidi_it->separator_limit
3247                 = bidi_it->charpos + bidi_it->nchars + sep_len;
3248             }
3249         }
3250     }
3251
3252   if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
3253     {
3254       /* If we are at paragraph's base embedding level and beyond the
3255          last cached position, the cache's job is done and we can
3256          discard it.  */
3257       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3258           && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
3259                                  + bidi_cache[bidi_cache_idx - 1].nchars - 1))
3260         bidi_cache_reset ();
3261         /* But as long as we are caching during forward scan, we must
3262            cache each state, or else the cache integrity will be
3263            compromised: it assumes cached states correspond to buffer
3264            positions 1:1.  */
3265       else
3266         bidi_cache_iterator_state (bidi_it, 1, 0);
3267     }
3268
3269   eassert (bidi_it->resolved_level >= 0
3270            && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
3271
3272   if (STRINGP (bidi_it->string.lstring))
3273     UNGCPRO;
3274 }
3275
3276 /* This is meant to be called from within the debugger, whenever you
3277    wish to examine the cache contents.  */
3278 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
3279 void
3280 bidi_dump_cached_states (void)
3281 {
3282   ptrdiff_t i;
3283   int ndigits = 1;
3284
3285   if (bidi_cache_idx == 0)
3286     {
3287       fprintf (stderr, "The cache is empty.\n");
3288       return;
3289     }
3290   fprintf (stderr, "Total of  %"pD"d state%s in cache:\n",
3291            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
3292
3293   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
3294     ndigits++;
3295   fputs ("ch  ", stderr);
3296   for (i = 0; i < bidi_cache_idx; i++)
3297     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
3298   fputs ("\n", stderr);
3299   fputs ("lvl ", stderr);
3300   for (i = 0; i < bidi_cache_idx; i++)
3301     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
3302   fputs ("\n", stderr);
3303   fputs ("pos ", stderr);
3304   for (i = 0; i < bidi_cache_idx; i++)
3305     fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
3306   fputs ("\n", stderr);
3307 }