code.delx.au - gnu-emacs/blob - src/bidi.c

   1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
   2    Copyright (C) 2000-2001, 2004-2005, 2009-2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    (UBA) as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the reference and most other implementations, this one is
  26    designed to be called once for every character in the buffer or
  27    string.
  28
  29    The main entry point is bidi_move_to_visually_next.  Each time it
  30    is called, it finds the next character in the visual order, and
  31    returns its information in a special structure.  The caller is then
  32    expected to process this character for display or any other
  33    purposes, and call bidi_move_to_visually_next for the next
  34    character.  See the comments in bidi_move_to_visually_next for more
  35    details about its algorithm that finds the next visual-order
  36    character by resolving their levels on the fly.
  37
  38    Two other entry points are bidi_paragraph_init and
  39    bidi_mirror_char.  The first determines the base direction of a
  40    paragraph, while the second returns the mirrored version of its
  41    argument character.
  42
  43    A few auxiliary entry points are used to initialize the bidi
  44    iterator for iterating an object (buffer or string), push and pop
  45    the bidi iterator state, and save and restore the state of the bidi
  46    cache.
  47
  48    If you want to understand the code, you will have to read it
  49    together with the relevant portions of UAX#9.  The comments include
  50    references to UAX#9 rules, for that very reason.
  51
  52    A note about references to UAX#9 rules: if the reference says
  53    something like "X9/Retaining", it means that you need to refer to
  54    rule X9 and to its modifications decribed in the "Implementation
  55    Notes" section of UAX#9, under "Retaining Format Codes".  */
  56
  57 #include <config.h>
  58 #include <stdio.h>
  59 #include <setjmp.h>
  60
  61 #include "lisp.h"
  62 #include "buffer.h"
  63 #include "character.h"
  64 #include "dispextern.h"
  65
  66 static int bidi_initialized = 0;
  67
  68 static Lisp_Object bidi_type_table, bidi_mirror_table;
  69
  70 #define LRM_CHAR   0x200E
  71 #define RLM_CHAR   0x200F
  72 #define BIDI_EOB   -1
  73
  74 /* Data type for describing the bidirectional character categories.  */
  75 typedef enum {
  76   UNKNOWN_BC,
  77   NEUTRAL,
  78   WEAK,
  79   STRONG
  80 } bidi_category_t;
  81
  82 /* UAX#9 says to search only for L, AL, or R types of characters, and
  83    ignore RLE, RLO, LRE, and LRO, when determining the base paragraph
  84    level.  Yudit indeed ignores them.  This variable is therefore set
  85    by default to ignore them, but setting it to zero will take them
  86    into account.  */
  87 extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
  88 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
  89
  90 static Lisp_Object paragraph_start_re, paragraph_separate_re;
  91 static Lisp_Object Qparagraph_start, Qparagraph_separate;
  92
  93 \f
  94 /***********************************************************************
  95                         Utilities
  96  ***********************************************************************/
  97
  98 /* Return the bidi type of a character CH, subject to the current
  99    directional OVERRIDE.  */
 100 static inline bidi_type_t
 101 bidi_get_type (int ch, bidi_dir_t override)
 102 {
 103   bidi_type_t default_type;
 104
 105   if (ch == BIDI_EOB)
 106     return NEUTRAL_B;
 107   if (ch < 0 || ch > MAX_CHAR)
 108     abort ();
 109
 110   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 111
 112   if (override == NEUTRAL_DIR)
 113     return default_type;
 114
 115   switch (default_type)
 116     {
 117       /* Although UAX#9 does not tell, it doesn't make sense to
 118          override NEUTRAL_B and LRM/RLM characters.  */
 119       case NEUTRAL_B:
 120       case LRE:
 121       case LRO:
 122       case RLE:
 123       case RLO:
 124       case PDF:
 125         return default_type;
 126       default:
 127         switch (ch)
 128           {
 129             case LRM_CHAR:
 130             case RLM_CHAR:
 131               return default_type;
 132             default:
 133               if (override == L2R) /* X6 */
 134                 return STRONG_L;
 135               else if (override == R2L)
 136                 return STRONG_R;
 137               else
 138                 abort ();       /* can't happen: handled above */
 139           }
 140     }
 141 }
 142
 143 static void
 144 bidi_check_type (bidi_type_t type)
 145 {
 146   if (type < UNKNOWN_BT || type > NEUTRAL_ON)
 147     abort ();
 148 }
 149
 150 /* Given a bidi TYPE of a character, return its category.  */
 151 static inline bidi_category_t
 152 bidi_get_category (bidi_type_t type)
 153 {
 154   switch (type)
 155     {
 156       case UNKNOWN_BT:
 157         return UNKNOWN_BC;
 158       case STRONG_L:
 159       case STRONG_R:
 160       case STRONG_AL:
 161       case LRE:
 162       case LRO:
 163       case RLE:
 164       case RLO:
 165         return STRONG;
 166       case PDF:         /* ??? really?? */
 167       case WEAK_EN:
 168       case WEAK_ES:
 169       case WEAK_ET:
 170       case WEAK_AN:
 171       case WEAK_CS:
 172       case WEAK_NSM:
 173       case WEAK_BN:
 174         return WEAK;
 175       case NEUTRAL_B:
 176       case NEUTRAL_S:
 177       case NEUTRAL_WS:
 178       case NEUTRAL_ON:
 179         return NEUTRAL;
 180       default:
 181         abort ();
 182     }
 183 }
 184
 185 /* Return the mirrored character of C, if it has one.  If C has no
 186    mirrored counterpart, return C.
 187    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 188    context must be tested by the caller.  */
 189 int
 190 bidi_mirror_char (int c)
 191 {
 192   Lisp_Object val;
 193
 194   if (c == BIDI_EOB)
 195     return c;
 196   if (c < 0 || c > MAX_CHAR)
 197     abort ();
 198
 199   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 200   if (INTEGERP (val))
 201     {
 202       int v = XINT (val);
 203
 204       if (v < 0 || v > MAX_CHAR)
 205         abort ();
 206
 207       return v;
 208     }
 209
 210   return c;
 211 }
 212
 213 /* Determine the start-of-run (sor) directional type given the two
 214    embedding levels on either side of the run boundary.  Also, update
 215    the saved info about previously seen characters, since that info is
 216    generally valid for a single level run.  */
 217 static inline void
 218 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
 219 {
 220   int higher_level = level_before > level_after ? level_before : level_after;
 221
 222   /* The prev_was_pdf gork is required for when we have several PDFs
 223      in a row.  In that case, we want to compute the sor type for the
 224      next level run only once: when we see the first PDF.  That's
 225      because the sor type depends only on the higher of the two levels
 226      that we find on the two sides of the level boundary (see UAX#9,
 227      clause X10), and so we don't need to know the final embedding
 228      level to which we descend after processing all the PDFs.  */
 229   if (!bidi_it->prev_was_pdf || level_before < level_after)
 230     /* FIXME: should the default sor direction be user selectable?  */
 231     bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
 232   if (level_before > level_after)
 233     bidi_it->prev_was_pdf = 1;
 234
 235   bidi_it->prev.type = UNKNOWN_BT;
 236   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 237     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 238   bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
 239   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 240   bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
 241   bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
 242     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 243   bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
 244 }
 245
 246 /* Push the current embedding level and override status; reset the
 247    current level to LEVEL and the current override status to OVERRIDE.  */
 248 static inline void
 249 bidi_push_embedding_level (struct bidi_it *bidi_it,
 250                            int level, bidi_dir_t override)
 251 {
 252   bidi_it->stack_idx++;
 253   xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
 254   bidi_it->level_stack[bidi_it->stack_idx].level = level;
 255   bidi_it->level_stack[bidi_it->stack_idx].override = override;
 256 }
 257
 258 /* Pop the embedding level and directional override status from the
 259    stack, and return the new level.  */
 260 static inline int
 261 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 262 {
 263   /* UAX#9 says to ignore invalid PDFs.  */
 264   if (bidi_it->stack_idx > 0)
 265     bidi_it->stack_idx--;
 266   return bidi_it->level_stack[bidi_it->stack_idx].level;
 267 }
 268
 269 /* Record in SAVED_INFO the information about the current character.  */
 270 static inline void
 271 bidi_remember_char (struct bidi_saved_info *saved_info,
 272                     struct bidi_it *bidi_it)
 273 {
 274   saved_info->charpos = bidi_it->charpos;
 275   saved_info->bytepos = bidi_it->bytepos;
 276   saved_info->type = bidi_it->type;
 277   bidi_check_type (bidi_it->type);
 278   saved_info->type_after_w1 = bidi_it->type_after_w1;
 279   bidi_check_type (bidi_it->type_after_w1);
 280   saved_info->orig_type = bidi_it->orig_type;
 281   bidi_check_type (bidi_it->orig_type);
 282 }
 283
 284 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 285    copies the part of the level stack that is actually in use.  */
 286 static inline void
 287 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 288 {
 289   int i;
 290
 291   /* Copy everything except the level stack and beyond.  */
 292   memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
 293
 294   /* Copy the active part of the level stack.  */
 295   to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
 296   for (i = 1; i <= from->stack_idx; i++)
 297     to->level_stack[i] = from->level_stack[i];
 298 }
 299
 300 \f
 301 /***********************************************************************
 302                         Caching the bidi iterator states
 303  ***********************************************************************/
 304
 305 #define BIDI_CACHE_CHUNK 200
 306 static struct bidi_it *bidi_cache;
 307 static ptrdiff_t bidi_cache_size = 0;
 308 enum { elsz = sizeof (struct bidi_it) };
 309 static ptrdiff_t bidi_cache_idx;        /* next unused cache slot */
 310 static ptrdiff_t bidi_cache_last_idx;   /* slot of last cache hit */
 311 static ptrdiff_t bidi_cache_start = 0;  /* start of cache for this
 312                                            "stack" level */
 313
 314 /* 5-slot stack for saving the start of the previous level of the
 315    cache.  xdisp.c maintains a 5-slot stack for its iterator state,
 316    and we need the same size of our stack.  */
 317 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
 318 static int bidi_cache_sp;
 319
 320 /* Size of header used by bidi_shelve_cache.  */
 321 enum
 322   {
 323     bidi_shelve_header_size =
 324       (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
 325        + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
 326        + sizeof (bidi_cache_last_idx))
 327   };
 328
 329 /* Reset the cache state to the empty state.  We only reset the part
 330    of the cache relevant to iteration of the current object.  Previous
 331    objects, which are pushed on the display iterator's stack, are left
 332    intact.  This is called when the cached information is no more
 333    useful for the current iteration, e.g. when we were reseated to a
 334    new position on the same object.  */
 335 static inline void
 336 bidi_cache_reset (void)
 337 {
 338   bidi_cache_idx = bidi_cache_start;
 339   bidi_cache_last_idx = -1;
 340 }
 341
 342 /* Shrink the cache to its minimal size.  Called when we init the bidi
 343    iterator for reordering a buffer or a string that does not come
 344    from display properties, because that means all the previously
 345    cached info is of no further use.  */
 346 static inline void
 347 bidi_cache_shrink (void)
 348 {
 349   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 350     {
 351       bidi_cache_size = BIDI_CACHE_CHUNK;
 352       bidi_cache =
 353         (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
 354     }
 355   bidi_cache_reset ();
 356 }
 357
 358 static inline void
 359 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
 360 {
 361   int current_scan_dir = bidi_it->scan_dir;
 362
 363   if (idx < bidi_cache_start || idx >= bidi_cache_idx)
 364     abort ();
 365
 366   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 367   bidi_it->scan_dir = current_scan_dir;
 368   bidi_cache_last_idx = idx;
 369 }
 370
 371 /* Find a cached state with a given CHARPOS and resolved embedding
 372    level less or equal to LEVEL.  if LEVEL is -1, disregard the
 373    resolved levels in cached states.  DIR, if non-zero, means search
 374    in that direction from the last cache hit.  */
 375 static inline ptrdiff_t
 376 bidi_cache_search (EMACS_INT charpos, int level, int dir)
 377 {
 378   ptrdiff_t i, i_start;
 379
 380   if (bidi_cache_idx > bidi_cache_start)
 381     {
 382       if (bidi_cache_last_idx == -1)
 383         bidi_cache_last_idx = bidi_cache_idx - 1;
 384       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 385         {
 386           dir = -1;
 387           i_start = bidi_cache_last_idx - 1;
 388         }
 389       else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
 390                           + bidi_cache[bidi_cache_last_idx].nchars - 1))
 391         {
 392           dir = 1;
 393           i_start = bidi_cache_last_idx + 1;
 394         }
 395       else if (dir)
 396         i_start = bidi_cache_last_idx;
 397       else
 398         {
 399           dir = -1;
 400           i_start = bidi_cache_idx - 1;
 401         }
 402
 403       if (dir < 0)
 404         {
 405           /* Linear search for now; FIXME!  */
 406           for (i = i_start; i >= bidi_cache_start; i--)
 407             if (bidi_cache[i].charpos <= charpos
 408                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 409                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 410               return i;
 411         }
 412       else
 413         {
 414           for (i = i_start; i < bidi_cache_idx; i++)
 415             if (bidi_cache[i].charpos <= charpos
 416                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 417                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 418               return i;
 419         }
 420     }
 421
 422   return -1;
 423 }
 424
 425 /* Find a cached state where the resolved level changes to a value
 426    that is lower than LEVEL, and return its cache slot index.  DIR is
 427    the direction to search, starting with the last used cache slot.
 428    If DIR is zero, we search backwards from the last occupied cache
 429    slot.  BEFORE, if non-zero, means return the index of the slot that
 430    is ``before'' the level change in the search direction.  That is,
 431    given the cached levels like this:
 432
 433          1122333442211
 434           AB        C
 435
 436    and assuming we are at the position cached at the slot marked with
 437    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 438    index of slot B or A, depending whether BEFORE is, respectively,
 439    non-zero or zero.  */
 440 static ptrdiff_t
 441 bidi_cache_find_level_change (int level, int dir, int before)
 442 {
 443   if (bidi_cache_idx)
 444     {
 445       ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 446       int incr = before ? 1 : 0;
 447
 448       xassert (!dir || bidi_cache_last_idx >= 0);
 449
 450       if (!dir)
 451         dir = -1;
 452       else if (!incr)
 453         i += dir;
 454
 455       if (dir < 0)
 456         {
 457           while (i >= bidi_cache_start + incr)
 458             {
 459               if (bidi_cache[i - incr].resolved_level >= 0
 460                   && bidi_cache[i - incr].resolved_level < level)
 461                 return i;
 462               i--;
 463             }
 464         }
 465       else
 466         {
 467           while (i < bidi_cache_idx - incr)
 468             {
 469               if (bidi_cache[i + incr].resolved_level >= 0
 470                   && bidi_cache[i + incr].resolved_level < level)
 471                 return i;
 472               i++;
 473             }
 474         }
 475     }
 476
 477   return -1;
 478 }
 479
 480 static inline void
 481 bidi_cache_ensure_space (ptrdiff_t idx)
 482 {
 483   /* Enlarge the cache as needed.  */
 484   if (idx >= bidi_cache_size)
 485     {
 486       /* The bidi cache cannot be larger than the largest Lisp string
 487          or buffer.  */
 488       ptrdiff_t string_or_buffer_bound =
 489         max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
 490
 491       /* Also, it cannot be larger than what C can represent.  */
 492       ptrdiff_t c_bound =
 493         (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
 494
 495       bidi_cache =
 496         xpalloc (bidi_cache, &bidi_cache_size,
 497                  max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
 498                  min (string_or_buffer_bound, c_bound), elsz);
 499     }
 500 }
 501
 502 static inline void
 503 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
 504 {
 505   ptrdiff_t idx;
 506
 507   /* We should never cache on backward scans.  */
 508   if (bidi_it->scan_dir == -1)
 509     abort ();
 510   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 511
 512   if (idx < 0)
 513     {
 514       idx = bidi_cache_idx;
 515       bidi_cache_ensure_space (idx);
 516       /* Character positions should correspond to cache positions 1:1.
 517          If we are outside the range of cached positions, the cache is
 518          useless and must be reset.  */
 519       if (idx > bidi_cache_start &&
 520           (bidi_it->charpos > (bidi_cache[idx - 1].charpos
 521                                + bidi_cache[idx - 1].nchars)
 522            || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
 523         {
 524           bidi_cache_reset ();
 525           idx = bidi_cache_start;
 526         }
 527       if (bidi_it->nchars <= 0)
 528         abort ();
 529       bidi_copy_it (&bidi_cache[idx], bidi_it);
 530       if (!resolved)
 531         bidi_cache[idx].resolved_level = -1;
 532     }
 533   else
 534     {
 535       /* Copy only the members which could have changed, to avoid
 536          costly copying of the entire struct.  */
 537       bidi_cache[idx].type = bidi_it->type;
 538       bidi_check_type (bidi_it->type);
 539       bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
 540       bidi_check_type (bidi_it->type_after_w1);
 541       if (resolved)
 542         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 543       else
 544         bidi_cache[idx].resolved_level = -1;
 545       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 546       bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
 547       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 548       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 549       bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
 550     }
 551
 552   bidi_cache_last_idx = idx;
 553   if (idx >= bidi_cache_idx)
 554     bidi_cache_idx = idx + 1;
 555 }
 556
 557 static inline bidi_type_t
 558 bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
 559 {
 560   ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
 561
 562   if (i >= bidi_cache_start)
 563     {
 564       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 565
 566       bidi_copy_it (bidi_it, &bidi_cache[i]);
 567       bidi_cache_last_idx = i;
 568       /* Don't let scan direction from from the cached state override
 569          the current scan direction.  */
 570       bidi_it->scan_dir = current_scan_dir;
 571       return bidi_it->type;
 572     }
 573
 574   return UNKNOWN_BT;
 575 }
 576
 577 static inline int
 578 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 579 {
 580   if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
 581     abort ();
 582   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 583 }
 584
 585 \f
 586 /***********************************************************************
 587              Pushing and popping the bidi iterator state
 588  ***********************************************************************/
 589
 590 /* Push the bidi iterator state in preparation for reordering a
 591    different object, e.g. display string found at certain buffer
 592    position.  Pushing the bidi iterator boils down to saving its
 593    entire state on the cache and starting a new cache "stacked" on top
 594    of the current cache.  */
 595 void
 596 bidi_push_it (struct bidi_it *bidi_it)
 597 {
 598   /* Save the current iterator state in its entirety after the last
 599      used cache slot.  */
 600   bidi_cache_ensure_space (bidi_cache_idx);
 601   memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
 602
 603   /* Push the current cache start onto the stack.  */
 604   xassert (bidi_cache_sp < IT_STACK_SIZE);
 605   bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
 606
 607   /* Start a new level of cache, and make it empty.  */
 608   bidi_cache_start = bidi_cache_idx;
 609   bidi_cache_last_idx = -1;
 610 }
 611
 612 /* Restore the iterator state saved by bidi_push_it and return the
 613    cache to the corresponding state.  */
 614 void
 615 bidi_pop_it (struct bidi_it *bidi_it)
 616 {
 617   if (bidi_cache_start <= 0)
 618     abort ();
 619
 620   /* Reset the next free cache slot index to what it was before the
 621      call to bidi_push_it.  */
 622   bidi_cache_idx = bidi_cache_start - 1;
 623
 624   /* Restore the bidi iterator state saved in the cache.  */
 625   memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it));
 626
 627   /* Pop the previous cache start from the stack.  */
 628   if (bidi_cache_sp <= 0)
 629     abort ();
 630   bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
 631
 632   /* Invalidate the last-used cache slot data.  */
 633   bidi_cache_last_idx = -1;
 634 }
 635
 636 static ptrdiff_t bidi_cache_total_alloc;
 637
 638 /* Stash away a copy of the cache and its control variables.  */
 639 void *
 640 bidi_shelve_cache (void)
 641 {
 642   unsigned char *databuf;
 643   ptrdiff_t alloc;
 644
 645   /* Empty cache.  */
 646   if (bidi_cache_idx == 0)
 647     return NULL;
 648
 649   alloc = (bidi_shelve_header_size
 650            + bidi_cache_idx * sizeof (struct bidi_it));
 651   databuf = xmalloc (alloc);
 652   bidi_cache_total_alloc += alloc;
 653
 654   memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
 655   memcpy (databuf + sizeof (bidi_cache_idx),
 656           bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
 657   memcpy (databuf + sizeof (bidi_cache_idx)
 658           + bidi_cache_idx * sizeof (struct bidi_it),
 659           bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
 660   memcpy (databuf + sizeof (bidi_cache_idx)
 661           + bidi_cache_idx * sizeof (struct bidi_it)
 662           + sizeof (bidi_cache_start_stack),
 663           &bidi_cache_sp, sizeof (bidi_cache_sp));
 664   memcpy (databuf + sizeof (bidi_cache_idx)
 665           + bidi_cache_idx * sizeof (struct bidi_it)
 666           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 667           &bidi_cache_start, sizeof (bidi_cache_start));
 668   memcpy (databuf + sizeof (bidi_cache_idx)
 669           + bidi_cache_idx * sizeof (struct bidi_it)
 670           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 671           + sizeof (bidi_cache_start),
 672           &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
 673
 674   return databuf;
 675 }
 676
 677 /* Restore the cache state from a copy stashed away by
 678    bidi_shelve_cache, and free the buffer used to stash that copy.
 679    JUST_FREE non-zero means free the buffer, but don't restore the
 680    cache; used when the corresponding iterator is discarded instead of
 681    being restored.  */
 682 void
 683 bidi_unshelve_cache (void *databuf, int just_free)
 684 {
 685   unsigned char *p = databuf;
 686
 687   if (!p)
 688     {
 689       if (!just_free)
 690         {
 691           /* A NULL pointer means an empty cache.  */
 692           bidi_cache_start = 0;
 693           bidi_cache_sp = 0;
 694           bidi_cache_reset ();
 695         }
 696     }
 697   else
 698     {
 699       if (just_free)
 700         {
 701           ptrdiff_t idx;
 702
 703           memcpy (&idx, p, sizeof (bidi_cache_idx));
 704           bidi_cache_total_alloc -=
 705             bidi_shelve_header_size + idx * sizeof (struct bidi_it);
 706         }
 707       else
 708         {
 709           memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
 710           bidi_cache_ensure_space (bidi_cache_idx);
 711           memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
 712                   bidi_cache_idx * sizeof (struct bidi_it));
 713           memcpy (bidi_cache_start_stack,
 714                   p + sizeof (bidi_cache_idx)
 715                   + bidi_cache_idx * sizeof (struct bidi_it),
 716                   sizeof (bidi_cache_start_stack));
 717           memcpy (&bidi_cache_sp,
 718                   p + sizeof (bidi_cache_idx)
 719                   + bidi_cache_idx * sizeof (struct bidi_it)
 720                   + sizeof (bidi_cache_start_stack),
 721                   sizeof (bidi_cache_sp));
 722           memcpy (&bidi_cache_start,
 723                   p + sizeof (bidi_cache_idx)
 724                   + bidi_cache_idx * sizeof (struct bidi_it)
 725                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 726                   sizeof (bidi_cache_start));
 727           memcpy (&bidi_cache_last_idx,
 728                   p + sizeof (bidi_cache_idx)
 729                   + bidi_cache_idx * sizeof (struct bidi_it)
 730                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 731                   + sizeof (bidi_cache_start),
 732                   sizeof (bidi_cache_last_idx));
 733           bidi_cache_total_alloc -=
 734             bidi_shelve_header_size + bidi_cache_idx * sizeof (struct bidi_it);
 735         }
 736
 737       xfree (p);
 738     }
 739 }
 740
 741 \f
 742 /***********************************************************************
 743                         Initialization
 744  ***********************************************************************/
 745 static void
 746 bidi_initialize (void)
 747 {
 748   bidi_type_table = uniprop_table (intern ("bidi-class"));
 749   if (NILP (bidi_type_table))
 750     abort ();
 751   staticpro (&bidi_type_table);
 752
 753   bidi_mirror_table = uniprop_table (intern ("mirroring"));
 754   if (NILP (bidi_mirror_table))
 755     abort ();
 756   staticpro (&bidi_mirror_table);
 757
 758   Qparagraph_start = intern ("paragraph-start");
 759   staticpro (&Qparagraph_start);
 760   paragraph_start_re = Fsymbol_value (Qparagraph_start);
 761   if (!STRINGP (paragraph_start_re))
 762     paragraph_start_re = build_string ("\f\\|[ \t]*$");
 763   staticpro (&paragraph_start_re);
 764   Qparagraph_separate = intern ("paragraph-separate");
 765   staticpro (&Qparagraph_separate);
 766   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
 767   if (!STRINGP (paragraph_separate_re))
 768     paragraph_separate_re = build_string ("[ \t\f]*$");
 769   staticpro (&paragraph_separate_re);
 770
 771   bidi_cache_sp = 0;
 772   bidi_cache_total_alloc = 0;
 773
 774   bidi_initialized = 1;
 775 }
 776
 777 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
 778    end.  */
 779 static inline void
 780 bidi_set_paragraph_end (struct bidi_it *bidi_it)
 781 {
 782   bidi_it->invalid_levels = 0;
 783   bidi_it->invalid_rl_levels = -1;
 784   bidi_it->stack_idx = 0;
 785   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 786 }
 787
 788 /* Initialize the bidi iterator from buffer/string position CHARPOS.  */
 789 void
 790 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
 791               struct bidi_it *bidi_it)
 792 {
 793   if (! bidi_initialized)
 794     bidi_initialize ();
 795   if (charpos >= 0)
 796     bidi_it->charpos = charpos;
 797   if (bytepos >= 0)
 798     bidi_it->bytepos = bytepos;
 799   bidi_it->frame_window_p = frame_window_p;
 800   bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
 801   bidi_it->first_elt = 1;
 802   bidi_set_paragraph_end (bidi_it);
 803   bidi_it->new_paragraph = 1;
 804   bidi_it->separator_limit = -1;
 805   bidi_it->type = NEUTRAL_B;
 806   bidi_it->type_after_w1 = NEUTRAL_B;
 807   bidi_it->orig_type = NEUTRAL_B;
 808   bidi_it->prev_was_pdf = 0;
 809   bidi_it->prev.type = bidi_it->prev.type_after_w1 =
 810     bidi_it->prev.orig_type = UNKNOWN_BT;
 811   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 812     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 813   bidi_it->next_for_neutral.charpos = -1;
 814   bidi_it->next_for_neutral.type =
 815     bidi_it->next_for_neutral.type_after_w1 =
 816     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 817   bidi_it->prev_for_neutral.charpos = -1;
 818   bidi_it->prev_for_neutral.type =
 819     bidi_it->prev_for_neutral.type_after_w1 =
 820     bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
 821   bidi_it->sor = L2R;    /* FIXME: should it be user-selectable? */
 822   bidi_it->disp_pos = -1;       /* invalid/unknown */
 823   bidi_it->disp_prop_p = 0;
 824   /* We can only shrink the cache if we are at the bottom level of its
 825      "stack".  */
 826   if (bidi_cache_start == 0)
 827     bidi_cache_shrink ();
 828   else
 829     bidi_cache_reset ();
 830 }
 831
 832 /* Perform initializations for reordering a new line of bidi text.  */
 833 static void
 834 bidi_line_init (struct bidi_it *bidi_it)
 835 {
 836   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
 837   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 838   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
 839   bidi_it->invalid_levels = 0;
 840   bidi_it->invalid_rl_levels = -1;
 841   bidi_it->next_en_pos = -1;
 842   bidi_it->next_for_ws.type = UNKNOWN_BT;
 843   bidi_set_sor_type (bidi_it,
 844                      bidi_it->paragraph_dir == R2L ? 1 : 0,
 845                      bidi_it->level_stack[0].level); /* X10 */
 846
 847   bidi_cache_reset ();
 848 }
 849
 850 \f
 851 /***********************************************************************
 852                         Fetching characters
 853  ***********************************************************************/
 854
 855 /* Count bytes in string S between BEG/BEGBYTE and END.  BEG and END
 856    are zero-based character positions in S, BEGBYTE is byte position
 857    corresponding to BEG.  UNIBYTE, if non-zero, means S is a unibyte
 858    string.  */
 859 static inline EMACS_INT
 860 bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
 861                   const EMACS_INT begbyte, const EMACS_INT end, int unibyte)
 862 {
 863   EMACS_INT pos = beg;
 864   const unsigned char *p = s + begbyte, *start = p;
 865
 866   if (unibyte)
 867     p = s + end;
 868   else
 869     {
 870       if (!CHAR_HEAD_P (*p))
 871         abort ();
 872
 873       while (pos < end)
 874         {
 875           p += BYTES_BY_CHAR_HEAD (*p);
 876           pos++;
 877         }
 878     }
 879
 880   return p - start;
 881 }
 882
 883 /* Fetch and returns the character at byte position BYTEPOS.  If S is
 884    non-NULL, fetch the character from string S; otherwise fetch the
 885    character from the current buffer.  UNIBYTE non-zero means S is a
 886    unibyte string.  */
 887 static inline int
 888 bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s, int unibyte)
 889 {
 890   if (s)
 891     {
 892       if (unibyte)
 893         return s[bytepos];
 894       else
 895         return STRING_CHAR (s + bytepos);
 896     }
 897   else
 898     return FETCH_MULTIBYTE_CHAR (bytepos);
 899 }
 900
 901 /* Fetch and return the character at BYTEPOS/CHARPOS.  If that
 902    character is covered by a display string, treat the entire run of
 903    covered characters as a single character u+FFFC, and return their
 904    combined length in CH_LEN and NCHARS.  DISP_POS specifies the
 905    character position of the next display string, or -1 if not yet
 906    computed.  DISP_PROP_P non-zero means that there's really a display
 907    string at DISP_POS, as opposed to when we searched till DISP_POS
 908    without findingone.  When the next character is at or beyond that
 909    position, the function updates DISP_POS with the position of the
 910    next display string.  STRING->s is the C string to iterate, or NULL
 911    if iterating over a buffer or a Lisp string; in the latter case,
 912    STRING->lstring is the Lisp string.  */
 913 static inline int
 914 bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
 915                  int *disp_prop_p, struct bidi_string_data *string,
 916                  int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
 917 {
 918   int ch;
 919   EMACS_INT endpos =
 920     (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
 921   struct text_pos pos;
 922
 923   /* If we got past the last known position of display string, compute
 924      the position of the next one.  That position could be at CHARPOS.  */
 925   if (charpos < endpos && charpos > *disp_pos)
 926     {
 927       SET_TEXT_POS (pos, charpos, bytepos);
 928       *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
 929                                               disp_prop_p);
 930     }
 931
 932   /* Fetch the character at BYTEPOS.  */
 933   if (charpos >= endpos)
 934     {
 935       ch = BIDI_EOB;
 936       *ch_len = 1;
 937       *nchars = 1;
 938       *disp_pos = endpos;
 939       *disp_prop_p = 0;
 940     }
 941   else if (charpos >= *disp_pos && *disp_prop_p)
 942     {
 943       EMACS_INT disp_end_pos;
 944
 945       /* We don't expect to find ourselves in the middle of a display
 946          property.  Hopefully, it will never be needed.  */
 947       if (charpos > *disp_pos)
 948         abort ();
 949       /* Return the Unicode Object Replacement Character to represent
 950          the entire run of characters covered by the display string.  */
 951       ch = 0xFFFC;
 952       disp_end_pos = compute_display_string_end (*disp_pos, string);
 953       *nchars = disp_end_pos - *disp_pos;
 954       if (*nchars <= 0)
 955         abort ();
 956       if (string->s)
 957         *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
 958                                     disp_end_pos, string->unibyte);
 959       else if (STRINGP (string->lstring))
 960         *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
 961                                     bytepos, disp_end_pos, string->unibyte);
 962       else
 963         *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
 964     }
 965   else
 966     {
 967       if (string->s)
 968         {
 969           int len;
 970
 971           if (!string->unibyte)
 972             {
 973               ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
 974               *ch_len = len;
 975             }
 976           else
 977             {
 978               ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
 979               *ch_len = 1;
 980             }
 981         }
 982       else if (STRINGP (string->lstring))
 983         {
 984           int len;
 985
 986           if (!string->unibyte)
 987             {
 988               ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
 989                                            len);
 990               *ch_len = len;
 991             }
 992           else
 993             {
 994               ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
 995               *ch_len = 1;
 996             }
 997         }
 998       else
 999         {
1000           ch = FETCH_MULTIBYTE_CHAR (bytepos);
1001           *ch_len = CHAR_BYTES (ch);
1002         }
1003       *nchars = 1;
1004     }
1005
1006   /* If we just entered a run of characters covered by a display
1007      string, compute the position of the next display string.  */
1008   if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1009       && *disp_prop_p)
1010     {
1011       SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1012       *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
1013                                               disp_prop_p);
1014     }
1015
1016   return ch;
1017 }
1018
1019 \f
1020 /***********************************************************************
1021                         Determining paragraph direction
1022  ***********************************************************************/
1023
1024 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1025    Value is the non-negative length of the paragraph separator
1026    following the buffer position, -1 if position is at the beginning
1027    of a new paragraph, or -2 if position is neither at beginning nor
1028    at end of a paragraph.  */
1029 static EMACS_INT
1030 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
1031 {
1032   Lisp_Object sep_re;
1033   Lisp_Object start_re;
1034   EMACS_INT val;
1035
1036   sep_re = paragraph_separate_re;
1037   start_re = paragraph_start_re;
1038
1039   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1040   if (val < 0)
1041     {
1042       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1043         val = -1;
1044       else
1045         val = -2;
1046     }
1047
1048   return val;
1049 }
1050
1051 /* Find the beginning of this paragraph by looking back in the buffer.
1052    Value is the byte position of the paragraph's beginning.  */
1053 static EMACS_INT
1054 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
1055 {
1056   Lisp_Object re = paragraph_start_re;
1057   EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
1058
1059   while (pos_byte > BEGV_BYTE
1060          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1061     {
1062       /* FIXME: What if the paragraph beginning is covered by a
1063          display string?  And what if a display string covering some
1064          of the text over which we scan back includes
1065          paragraph_start_re?  */
1066       pos = find_next_newline_no_quit (pos - 1, -1);
1067       pos_byte = CHAR_TO_BYTE (pos);
1068     }
1069   return pos_byte;
1070 }
1071
1072 /* Determine the base direction, a.k.a. base embedding level, of the
1073    paragraph we are about to iterate through.  If DIR is either L2R or
1074    R2L, just use that.  Otherwise, determine the paragraph direction
1075    from the first strong directional character of the paragraph.
1076
1077    NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
1078    has no strong directional characters and both DIR and
1079    bidi_it->paragraph_dir are NEUTRAL_DIR.  In that case, search back
1080    in the buffer until a paragraph is found with a strong character,
1081    or until hitting BEGV.  In the latter case, fall back to L2R.  This
1082    flag is used in current-bidi-paragraph-direction.
1083
1084    Note that this function gives the paragraph separator the same
1085    direction as the preceding paragraph, even though Emacs generally
1086    views the separartor as not belonging to any paragraph.  */
1087 void
1088 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
1089 {
1090   EMACS_INT bytepos = bidi_it->bytepos;
1091   int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
1092   EMACS_INT pstartbyte;
1093   /* Note that begbyte is a byte position, while end is a character
1094      position.  Yes, this is ugly, but we are trying to avoid costly
1095      calls to BYTE_TO_CHAR and its ilk.  */
1096   EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
1097   EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
1098
1099   /* Special case for an empty buffer. */
1100   if (bytepos == begbyte && bidi_it->charpos == end)
1101     dir = L2R;
1102   /* We should never be called at EOB or before BEGV.  */
1103   else if (bidi_it->charpos >= end || bytepos < begbyte)
1104     abort ();
1105
1106   if (dir == L2R)
1107     {
1108       bidi_it->paragraph_dir = L2R;
1109       bidi_it->new_paragraph = 0;
1110     }
1111   else if (dir == R2L)
1112     {
1113       bidi_it->paragraph_dir = R2L;
1114       bidi_it->new_paragraph = 0;
1115     }
1116   else if (dir == NEUTRAL_DIR)  /* P2 */
1117     {
1118       int ch;
1119       EMACS_INT ch_len, nchars;
1120       EMACS_INT pos, disp_pos = -1;
1121       int disp_prop_p = 0;
1122       bidi_type_t type;
1123       const unsigned char *s;
1124
1125       if (!bidi_initialized)
1126         bidi_initialize ();
1127
1128       /* If we are inside a paragraph separator, we are just waiting
1129          for the separator to be exhausted; use the previous paragraph
1130          direction.  But don't do that if we have been just reseated,
1131          because we need to reinitialize below in that case.  */
1132       if (!bidi_it->first_elt
1133           && bidi_it->charpos < bidi_it->separator_limit)
1134         return;
1135
1136       /* If we are on a newline, get past it to where the next
1137          paragraph might start.  But don't do that at BEGV since then
1138          we are potentially in a new paragraph that doesn't yet
1139          exist.  */
1140       pos = bidi_it->charpos;
1141       s = STRINGP (bidi_it->string.lstring) ?
1142         SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1143       if (bytepos > begbyte
1144           && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1145         {
1146           bytepos++;
1147           pos++;
1148         }
1149
1150       /* We are either at the beginning of a paragraph or in the
1151          middle of it.  Find where this paragraph starts.  */
1152       if (string_p)
1153         {
1154           /* We don't support changes of paragraph direction inside a
1155              string.  It is treated as a single paragraph.  */
1156           pstartbyte = 0;
1157         }
1158       else
1159         pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1160       bidi_it->separator_limit = -1;
1161       bidi_it->new_paragraph = 0;
1162
1163       /* The following loop is run more than once only if NO_DEFAULT_P
1164          is non-zero, and only if we are iterating on a buffer.  */
1165       do {
1166         bytepos = pstartbyte;
1167         if (!string_p)
1168           pos = BYTE_TO_CHAR (bytepos);
1169         ch = bidi_fetch_char (bytepos, pos, &disp_pos, &disp_prop_p,
1170                               &bidi_it->string,
1171                               bidi_it->frame_window_p, &ch_len, &nchars);
1172         type = bidi_get_type (ch, NEUTRAL_DIR);
1173
1174         for (pos += nchars, bytepos += ch_len;
1175              (bidi_get_category (type) != STRONG)
1176                || (bidi_ignore_explicit_marks_for_paragraph_level
1177                    && (type == RLE || type == RLO
1178                        || type == LRE || type == LRO));
1179              type = bidi_get_type (ch, NEUTRAL_DIR))
1180           {
1181             if (pos >= end)
1182               {
1183                 /* Pretend there's a paragraph separator at end of
1184                    buffer/string.  */
1185                 type = NEUTRAL_B;
1186                 break;
1187               }
1188             if (!string_p
1189                 && type == NEUTRAL_B
1190                 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1191               break;
1192             /* Fetch next character and advance to get past it.  */
1193             ch = bidi_fetch_char (bytepos, pos, &disp_pos,
1194                                   &disp_prop_p, &bidi_it->string,
1195                                   bidi_it->frame_window_p, &ch_len, &nchars);
1196             pos += nchars;
1197             bytepos += ch_len;
1198           }
1199         if ((type == STRONG_R || type == STRONG_AL) /* P3 */
1200             || (!bidi_ignore_explicit_marks_for_paragraph_level
1201                 && (type == RLO || type == RLE)))
1202           bidi_it->paragraph_dir = R2L;
1203         else if (type == STRONG_L
1204                  || (!bidi_ignore_explicit_marks_for_paragraph_level
1205                      && (type == LRO || type == LRE)))
1206           bidi_it->paragraph_dir = L2R;
1207         if (!string_p
1208             && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1209           {
1210             /* If this paragraph is at BEGV, default to L2R.  */
1211             if (pstartbyte == BEGV_BYTE)
1212               bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1213             else
1214               {
1215                 EMACS_INT prevpbyte = pstartbyte;
1216                 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1217
1218                 /* Find the beginning of the previous paragraph, if any.  */
1219                 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1220                   {
1221                     /* FXIME: What if p is covered by a display
1222                        string?  See also a FIXME inside
1223                        bidi_find_paragraph_start.  */
1224                     p--;
1225                     pbyte = CHAR_TO_BYTE (p);
1226                     prevpbyte = bidi_find_paragraph_start (p, pbyte);
1227                   }
1228                 pstartbyte = prevpbyte;
1229               }
1230           }
1231       } while (!string_p
1232                && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1233     }
1234   else
1235     abort ();
1236
1237   /* Contrary to UAX#9 clause P3, we only default the paragraph
1238      direction to L2R if we have no previous usable paragraph
1239      direction.  This is allowed by the HL1 clause.  */
1240   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1241     bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1242   if (bidi_it->paragraph_dir == R2L)
1243     bidi_it->level_stack[0].level = 1;
1244   else
1245     bidi_it->level_stack[0].level = 0;
1246
1247   bidi_line_init (bidi_it);
1248 }
1249
1250 \f
1251 /***********************************************************************
1252                  Resolving explicit and implicit levels.
1253   The rest of this file constitutes the core of the UBA implementation.
1254  ***********************************************************************/
1255
1256 static inline int
1257 bidi_explicit_dir_char (int ch)
1258 {
1259   bidi_type_t ch_type;
1260
1261   if (!bidi_initialized)
1262     abort ();
1263   ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1264   return (ch_type == LRE || ch_type == LRO
1265           || ch_type == RLE || ch_type == RLO
1266           || ch_type == PDF);
1267 }
1268
1269 /* A helper function for bidi_resolve_explicit.  It advances to the
1270    next character in logical order and determines the new embedding
1271    level and directional override, but does not take into account
1272    empty embeddings.  */
1273 static int
1274 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1275 {
1276   int curchar;
1277   bidi_type_t type;
1278   int current_level;
1279   int new_level;
1280   bidi_dir_t override;
1281   int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
1282
1283   /* If reseat()'ed, don't advance, so as to start iteration from the
1284      position where we were reseated.  bidi_it->bytepos can be less
1285      than BEGV_BYTE after reseat to BEGV.  */
1286   if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1287       || bidi_it->first_elt)
1288     {
1289       bidi_it->first_elt = 0;
1290       if (string_p)
1291         {
1292           const unsigned char *p =
1293             STRINGP (bidi_it->string.lstring)
1294             ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1295
1296           if (bidi_it->charpos < 0)
1297             bidi_it->charpos = 0;
1298           bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos,
1299                                                bidi_it->string.unibyte);
1300         }
1301       else
1302         {
1303           if (bidi_it->charpos < BEGV)
1304             bidi_it->charpos = BEGV;
1305           bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1306         }
1307     }
1308   /* Don't move at end of buffer/string.  */
1309   else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1310     {
1311       /* Advance to the next character, skipping characters covered by
1312          display strings (nchars > 1).  */
1313       if (bidi_it->nchars <= 0)
1314         abort ();
1315       bidi_it->charpos += bidi_it->nchars;
1316       if (bidi_it->ch_len == 0)
1317         abort ();
1318       bidi_it->bytepos += bidi_it->ch_len;
1319     }
1320
1321   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1322   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1323   new_level = current_level;
1324
1325   if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1326     {
1327       curchar = BIDI_EOB;
1328       bidi_it->ch_len = 1;
1329       bidi_it->nchars = 1;
1330       bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1331       bidi_it->disp_prop_p = 0;
1332     }
1333   else
1334     {
1335       /* Fetch the character at BYTEPOS.  If it is covered by a
1336          display string, treat the entire run of covered characters as
1337          a single character u+FFFC.  */
1338       curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
1339                                  &bidi_it->disp_pos, &bidi_it->disp_prop_p,
1340                                  &bidi_it->string, bidi_it->frame_window_p,
1341                                  &bidi_it->ch_len, &bidi_it->nchars);
1342     }
1343   bidi_it->ch = curchar;
1344
1345   /* Don't apply directional override here, as all the types we handle
1346      below will not be affected by the override anyway, and we need
1347      the original type unaltered.  The override will be applied in
1348      bidi_resolve_weak.  */
1349   type = bidi_get_type (curchar, NEUTRAL_DIR);
1350   bidi_it->orig_type = type;
1351   bidi_check_type (bidi_it->orig_type);
1352
1353   if (type != PDF)
1354     bidi_it->prev_was_pdf = 0;
1355
1356   bidi_it->type_after_w1 = UNKNOWN_BT;
1357
1358   switch (type)
1359     {
1360       case RLE: /* X2 */
1361       case RLO: /* X4 */
1362         bidi_it->type_after_w1 = type;
1363         bidi_check_type (bidi_it->type_after_w1);
1364         type = WEAK_BN; /* X9/Retaining */
1365         if (bidi_it->ignore_bn_limit <= -1)
1366           {
1367             if (current_level <= BIDI_MAXLEVEL - 4)
1368               {
1369                 /* Compute the least odd embedding level greater than
1370                    the current level.  */
1371                 new_level = ((current_level + 1) & ~1) + 1;
1372                 if (bidi_it->type_after_w1 == RLE)
1373                   override = NEUTRAL_DIR;
1374                 else
1375                   override = R2L;
1376                 if (current_level == BIDI_MAXLEVEL - 4)
1377                   bidi_it->invalid_rl_levels = 0;
1378                 bidi_push_embedding_level (bidi_it, new_level, override);
1379               }
1380             else
1381               {
1382                 bidi_it->invalid_levels++;
1383                 /* See the commentary about invalid_rl_levels below.  */
1384                 if (bidi_it->invalid_rl_levels < 0)
1385                   bidi_it->invalid_rl_levels = 0;
1386                 bidi_it->invalid_rl_levels++;
1387               }
1388           }
1389         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1390                  || bidi_it->next_en_pos > bidi_it->charpos)
1391           type = WEAK_EN;
1392         break;
1393       case LRE: /* X3 */
1394       case LRO: /* X5 */
1395         bidi_it->type_after_w1 = type;
1396         bidi_check_type (bidi_it->type_after_w1);
1397         type = WEAK_BN; /* X9/Retaining */
1398         if (bidi_it->ignore_bn_limit <= -1)
1399           {
1400             if (current_level <= BIDI_MAXLEVEL - 5)
1401               {
1402                 /* Compute the least even embedding level greater than
1403                    the current level.  */
1404                 new_level = ((current_level + 2) & ~1);
1405                 if (bidi_it->type_after_w1 == LRE)
1406                   override = NEUTRAL_DIR;
1407                 else
1408                   override = L2R;
1409                 bidi_push_embedding_level (bidi_it, new_level, override);
1410               }
1411             else
1412               {
1413                 bidi_it->invalid_levels++;
1414                 /* invalid_rl_levels counts invalid levels encountered
1415                    while the embedding level was already too high for
1416                    LRE/LRO, but not for RLE/RLO.  That is because
1417                    there may be exactly one PDF which we should not
1418                    ignore even though invalid_levels is non-zero.
1419                    invalid_rl_levels helps to know what PDF is
1420                    that.  */
1421                 if (bidi_it->invalid_rl_levels >= 0)
1422                   bidi_it->invalid_rl_levels++;
1423               }
1424           }
1425         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1426                  || bidi_it->next_en_pos > bidi_it->charpos)
1427           type = WEAK_EN;
1428         break;
1429       case PDF: /* X7 */
1430         bidi_it->type_after_w1 = type;
1431         bidi_check_type (bidi_it->type_after_w1);
1432         type = WEAK_BN; /* X9/Retaining */
1433         if (bidi_it->ignore_bn_limit <= -1)
1434           {
1435             if (!bidi_it->invalid_rl_levels)
1436               {
1437                 new_level = bidi_pop_embedding_level (bidi_it);
1438                 bidi_it->invalid_rl_levels = -1;
1439                 if (bidi_it->invalid_levels)
1440                   bidi_it->invalid_levels--;
1441                 /* else nothing: UAX#9 says to ignore invalid PDFs */
1442               }
1443             if (!bidi_it->invalid_levels)
1444               new_level = bidi_pop_embedding_level (bidi_it);
1445             else
1446               {
1447                 bidi_it->invalid_levels--;
1448                 bidi_it->invalid_rl_levels--;
1449               }
1450           }
1451         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1452                  || bidi_it->next_en_pos > bidi_it->charpos)
1453           type = WEAK_EN;
1454         break;
1455       default:
1456         /* Nothing.  */
1457         break;
1458     }
1459
1460   bidi_it->type = type;
1461   bidi_check_type (bidi_it->type);
1462
1463   return new_level;
1464 }
1465
1466 /* Given an iterator state in BIDI_IT, advance one character position
1467    in the buffer/string to the next character (in the logical order),
1468    resolve any explicit embeddings and directional overrides, and
1469    return the embedding level of the character after resolving
1470    explicit directives and ignoring empty embeddings.  */
1471 static int
1472 bidi_resolve_explicit (struct bidi_it *bidi_it)
1473 {
1474   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1475   int new_level  = bidi_resolve_explicit_1 (bidi_it);
1476   EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
1477   const unsigned char *s = STRINGP (bidi_it->string.lstring)
1478     ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1479
1480   if (prev_level < new_level
1481       && bidi_it->type == WEAK_BN
1482       && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1483       && bidi_it->charpos < eob         /* not already at EOB */
1484       && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1485                                                    + bidi_it->ch_len, s,
1486                                                    bidi_it->string.unibyte)))
1487     {
1488       /* Avoid pushing and popping embedding levels if the level run
1489          is empty, as this breaks level runs where it shouldn't.
1490          UAX#9 removes all the explicit embedding and override codes,
1491          so empty embeddings disappear without a trace.  We need to
1492          behave as if we did the same.  */
1493       struct bidi_it saved_it;
1494       int level = prev_level;
1495
1496       bidi_copy_it (&saved_it, bidi_it);
1497
1498       while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1499                                                        + bidi_it->ch_len, s,
1500                                                        bidi_it->string.unibyte)))
1501         {
1502           /* This advances to the next character, skipping any
1503              characters covered by display strings.  */
1504           level = bidi_resolve_explicit_1 (bidi_it);
1505           /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1506              a pointer to its data is no longer valid.  */
1507           if (STRINGP (bidi_it->string.lstring))
1508             s = SDATA (bidi_it->string.lstring);
1509         }
1510
1511       if (bidi_it->nchars <= 0)
1512         abort ();
1513       if (level == prev_level)  /* empty embedding */
1514         saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
1515       else                      /* this embedding is non-empty */
1516         saved_it.ignore_bn_limit = -2;
1517
1518       bidi_copy_it (bidi_it, &saved_it);
1519       if (bidi_it->ignore_bn_limit > -1)
1520         {
1521           /* We pushed a level, but we shouldn't have.  Undo that. */
1522           if (!bidi_it->invalid_rl_levels)
1523             {
1524               new_level = bidi_pop_embedding_level (bidi_it);
1525               bidi_it->invalid_rl_levels = -1;
1526               if (bidi_it->invalid_levels)
1527                 bidi_it->invalid_levels--;
1528             }
1529           if (!bidi_it->invalid_levels)
1530             new_level = bidi_pop_embedding_level (bidi_it);
1531           else
1532             {
1533               bidi_it->invalid_levels--;
1534               bidi_it->invalid_rl_levels--;
1535             }
1536         }
1537     }
1538
1539   if (bidi_it->type == NEUTRAL_B)       /* X8 */
1540     {
1541       bidi_set_paragraph_end (bidi_it);
1542       /* This is needed by bidi_resolve_weak below, and in L1.  */
1543       bidi_it->type_after_w1 = bidi_it->type;
1544       bidi_check_type (bidi_it->type_after_w1);
1545     }
1546
1547   return new_level;
1548 }
1549
1550 /* Advance in the buffer/string, resolve weak types and return the
1551    type of the next character after weak type resolution.  */
1552 static bidi_type_t
1553 bidi_resolve_weak (struct bidi_it *bidi_it)
1554 {
1555   bidi_type_t type;
1556   bidi_dir_t override;
1557   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1558   int new_level  = bidi_resolve_explicit (bidi_it);
1559   int next_char;
1560   bidi_type_t type_of_next;
1561   struct bidi_it saved_it;
1562   EMACS_INT eob =
1563     (STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1564     ? bidi_it->string.schars : ZV;
1565
1566   type = bidi_it->type;
1567   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1568
1569   if (type == UNKNOWN_BT
1570       || type == LRE
1571       || type == LRO
1572       || type == RLE
1573       || type == RLO
1574       || type == PDF)
1575     abort ();
1576
1577   if (new_level != prev_level
1578       || bidi_it->type == NEUTRAL_B)
1579     {
1580       /* We've got a new embedding level run, compute the directional
1581          type of sor and initialize per-run variables (UAX#9, clause
1582          X10).  */
1583       bidi_set_sor_type (bidi_it, prev_level, new_level);
1584     }
1585   else if (type == NEUTRAL_S || type == NEUTRAL_WS
1586            || type == WEAK_BN || type == STRONG_AL)
1587     bidi_it->type_after_w1 = type;      /* needed in L1 */
1588   bidi_check_type (bidi_it->type_after_w1);
1589
1590   /* Level and directional override status are already recorded in
1591      bidi_it, and do not need any change; see X6.  */
1592   if (override == R2L)          /* X6 */
1593     type = STRONG_R;
1594   else if (override == L2R)
1595     type = STRONG_L;
1596   else
1597     {
1598       if (type == WEAK_NSM)     /* W1 */
1599         {
1600           /* Note that we don't need to consider the case where the
1601              prev character has its type overridden by an RLO or LRO,
1602              because then either the type of this NSM would have been
1603              also overridden, or the previous character is outside the
1604              current level run, and thus not relevant to this NSM.
1605              This is why NSM gets the type_after_w1 of the previous
1606              character.  */
1607           if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1608               /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1609               && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1610             type = bidi_it->prev.type_after_w1;
1611           else if (bidi_it->sor == R2L)
1612             type = STRONG_R;
1613           else if (bidi_it->sor == L2R)
1614             type = STRONG_L;
1615           else /* shouldn't happen! */
1616             abort ();
1617         }
1618       if (type == WEAK_EN       /* W2 */
1619           && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1620         type = WEAK_AN;
1621       else if (type == STRONG_AL) /* W3 */
1622         type = STRONG_R;
1623       else if ((type == WEAK_ES /* W4 */
1624                 && bidi_it->prev.type_after_w1 == WEAK_EN
1625                 && bidi_it->prev.orig_type == WEAK_EN)
1626                || (type == WEAK_CS
1627                    && ((bidi_it->prev.type_after_w1 == WEAK_EN
1628                         && bidi_it->prev.orig_type == WEAK_EN)
1629                        || bidi_it->prev.type_after_w1 == WEAK_AN)))
1630         {
1631           const unsigned char *s =
1632             STRINGP (bidi_it->string.lstring)
1633             ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1634
1635           next_char =
1636             bidi_it->charpos + bidi_it->nchars >= eob
1637             ? BIDI_EOB
1638             : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1639                                 bidi_it->string.unibyte);
1640           type_of_next = bidi_get_type (next_char, override);
1641
1642           if (type_of_next == WEAK_BN
1643               || bidi_explicit_dir_char (next_char))
1644             {
1645               bidi_copy_it (&saved_it, bidi_it);
1646               while (bidi_resolve_explicit (bidi_it) == new_level
1647                      && bidi_it->type == WEAK_BN)
1648                 ;
1649               type_of_next = bidi_it->type;
1650               bidi_copy_it (bidi_it, &saved_it);
1651             }
1652
1653           /* If the next character is EN, but the last strong-type
1654              character is AL, that next EN will be changed to AN when
1655              we process it in W2 above.  So in that case, this ES
1656              should not be changed into EN.  */
1657           if (type == WEAK_ES
1658               && type_of_next == WEAK_EN
1659               && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1660             type = WEAK_EN;
1661           else if (type == WEAK_CS)
1662             {
1663               if (bidi_it->prev.type_after_w1 == WEAK_AN
1664                   && (type_of_next == WEAK_AN
1665                       /* If the next character is EN, but the last
1666                          strong-type character is AL, EN will be later
1667                          changed to AN when we process it in W2 above.
1668                          So in that case, this ES should not be
1669                          changed into EN.  */
1670                       || (type_of_next == WEAK_EN
1671                           && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1672                 type = WEAK_AN;
1673               else if (bidi_it->prev.type_after_w1 == WEAK_EN
1674                        && type_of_next == WEAK_EN
1675                        && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1676                 type = WEAK_EN;
1677             }
1678         }
1679       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
1680                || type == WEAK_BN)      /* W5/Retaining */
1681         {
1682           if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1683               || bidi_it->next_en_pos > bidi_it->charpos)
1684             type = WEAK_EN;
1685           else                  /* W5: ET/BN with EN after it.  */
1686             {
1687               EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
1688               const unsigned char *s =
1689                 STRINGP (bidi_it->string.lstring)
1690                 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1691
1692               if (bidi_it->nchars <= 0)
1693                 abort ();
1694               next_char =
1695                 bidi_it->charpos + bidi_it->nchars >= eob
1696                 ? BIDI_EOB
1697                 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1698                                     bidi_it->string.unibyte);
1699               type_of_next = bidi_get_type (next_char, override);
1700
1701               if (type_of_next == WEAK_ET
1702                   || type_of_next == WEAK_BN
1703                   || bidi_explicit_dir_char (next_char))
1704                 {
1705                   bidi_copy_it (&saved_it, bidi_it);
1706                   while (bidi_resolve_explicit (bidi_it) == new_level
1707                          && (bidi_it->type == WEAK_BN
1708                              || bidi_it->type == WEAK_ET))
1709                     ;
1710                   type_of_next = bidi_it->type;
1711                   en_pos = bidi_it->charpos;
1712                   bidi_copy_it (bidi_it, &saved_it);
1713                 }
1714               if (type_of_next == WEAK_EN)
1715                 {
1716                   /* If the last strong character is AL, the EN we've
1717                      found will become AN when we get to it (W2). */
1718                   if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1719                     {
1720                       type = WEAK_EN;
1721                       /* Remember this EN position, to speed up processing
1722                          of the next ETs.  */
1723                       bidi_it->next_en_pos = en_pos;
1724                     }
1725                   else if (type == WEAK_BN)
1726                     type = NEUTRAL_ON; /* W6/Retaining */
1727                 }
1728             }
1729         }
1730     }
1731
1732   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1733       || (type == WEAK_BN
1734           && (bidi_it->prev.type_after_w1 == WEAK_CS        /* W6/Retaining */
1735               || bidi_it->prev.type_after_w1 == WEAK_ES
1736               || bidi_it->prev.type_after_w1 == WEAK_ET)))
1737     type = NEUTRAL_ON;
1738
1739   /* Store the type we've got so far, before we clobber it with strong
1740      types in W7 and while resolving neutral types.  But leave alone
1741      the original types that were recorded above, because we will need
1742      them for the L1 clause.  */
1743   if (bidi_it->type_after_w1 == UNKNOWN_BT)
1744     bidi_it->type_after_w1 = type;
1745   bidi_check_type (bidi_it->type_after_w1);
1746
1747   if (type == WEAK_EN)  /* W7 */
1748     {
1749       if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1750           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1751         type = STRONG_L;
1752     }
1753
1754   bidi_it->type = type;
1755   bidi_check_type (bidi_it->type);
1756   return type;
1757 }
1758
1759 /* Resolve the type of a neutral character according to the type of
1760    surrounding strong text and the current embedding level.  */
1761 static inline bidi_type_t
1762 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1763 {
1764   /* N1: European and Arabic numbers are treated as though they were R.  */
1765   if (next_type == WEAK_EN || next_type == WEAK_AN)
1766     next_type = STRONG_R;
1767   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1768     prev_type = STRONG_R;
1769
1770   if (next_type == prev_type)   /* N1 */
1771     return next_type;
1772   else if ((lev & 1) == 0)      /* N2 */
1773     return STRONG_L;
1774   else
1775     return STRONG_R;
1776 }
1777
1778 static bidi_type_t
1779 bidi_resolve_neutral (struct bidi_it *bidi_it)
1780 {
1781   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1782   bidi_type_t type = bidi_resolve_weak (bidi_it);
1783   int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1784
1785   if (!(type == STRONG_R
1786         || type == STRONG_L
1787         || type == WEAK_BN
1788         || type == WEAK_EN
1789         || type == WEAK_AN
1790         || type == NEUTRAL_B
1791         || type == NEUTRAL_S
1792         || type == NEUTRAL_WS
1793         || type == NEUTRAL_ON))
1794     abort ();
1795
1796   if (bidi_get_category (type) == NEUTRAL
1797       || (type == WEAK_BN && prev_level == current_level))
1798     {
1799       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1800         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1801                                        bidi_it->next_for_neutral.type,
1802                                        current_level);
1803       else
1804         {
1805           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
1806              the assumption of batch-style processing; see clauses W4,
1807              W5, and especially N1, which require to look far forward
1808              (as well as back) in the buffer/string.  May the fleas of
1809              a thousand camels infest the armpits of those who design
1810              supposedly general-purpose algorithms by looking at their
1811              own implementations, and fail to consider other possible
1812              implementations!  */
1813           struct bidi_it saved_it;
1814           bidi_type_t next_type;
1815
1816           if (bidi_it->scan_dir == -1)
1817             abort ();
1818
1819           bidi_copy_it (&saved_it, bidi_it);
1820           /* Scan the text forward until we find the first non-neutral
1821              character, and then use that to resolve the neutral we
1822              are dealing with now.  We also cache the scanned iterator
1823              states, to salvage some of the effort later.  */
1824           bidi_cache_iterator_state (bidi_it, 0);
1825           do {
1826             /* Record the info about the previous character, so that
1827                it will be cached below with this state.  */
1828             if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1829                 && bidi_it->type != WEAK_BN)
1830               bidi_remember_char (&bidi_it->prev, bidi_it);
1831             type = bidi_resolve_weak (bidi_it);
1832             /* Paragraph separators have their levels fully resolved
1833                at this point, so cache them as resolved.  */
1834             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1835             /* FIXME: implement L1 here, by testing for a newline and
1836                resetting the level for any sequence of whitespace
1837                characters adjacent to it.  */
1838           } while (!(type == NEUTRAL_B
1839                      || (type != WEAK_BN
1840                          && bidi_get_category (type) != NEUTRAL)
1841                      /* This is all per level run, so stop when we
1842                         reach the end of this level run.  */
1843                      || bidi_it->level_stack[bidi_it->stack_idx].level !=
1844                      current_level));
1845
1846           bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1847
1848           switch (type)
1849             {
1850               case STRONG_L:
1851               case STRONG_R:
1852               case STRONG_AL:
1853                 next_type = type;
1854                 break;
1855               case WEAK_EN:
1856               case WEAK_AN:
1857                 /* N1: ``European and Arabic numbers are treated as
1858                    though they were R.''  */
1859                 next_type = STRONG_R;
1860                 saved_it.next_for_neutral.type = STRONG_R;
1861                 break;
1862               case WEAK_BN:
1863                 if (!bidi_explicit_dir_char (bidi_it->ch))
1864                   abort ();             /* can't happen: BNs are skipped */
1865                 /* FALLTHROUGH */
1866               case NEUTRAL_B:
1867                 /* Marched all the way to the end of this level run.
1868                    We need to use the eor type, whose information is
1869                    stored by bidi_set_sor_type in the prev_for_neutral
1870                    member.  */
1871                 if (saved_it.type != WEAK_BN
1872                     || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1873                   {
1874                     next_type = bidi_it->prev_for_neutral.type;
1875                     saved_it.next_for_neutral.type = next_type;
1876                     bidi_check_type (next_type);
1877                   }
1878                 else
1879                   {
1880                     /* This is a BN which does not adjoin neutrals.
1881                        Leave its type alone.  */
1882                     bidi_copy_it (bidi_it, &saved_it);
1883                     return bidi_it->type;
1884                   }
1885                 break;
1886               default:
1887                 abort ();
1888             }
1889           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1890                                          next_type, current_level);
1891           saved_it.type = type;
1892           bidi_check_type (type);
1893           bidi_copy_it (bidi_it, &saved_it);
1894         }
1895     }
1896   return type;
1897 }
1898
1899 /* Given an iterator state in BIDI_IT, advance one character position
1900    in the buffer/string to the next character (in the logical order),
1901    resolve the bidi type of that next character, and return that
1902    type.  */
1903 static bidi_type_t
1904 bidi_type_of_next_char (struct bidi_it *bidi_it)
1905 {
1906   bidi_type_t type;
1907
1908   /* This should always be called during a forward scan.  */
1909   if (bidi_it->scan_dir != 1)
1910     abort ();
1911
1912   /* Reset the limit until which to ignore BNs if we step out of the
1913      area where we found only empty levels.  */
1914   if ((bidi_it->ignore_bn_limit > -1
1915        && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1916       || (bidi_it->ignore_bn_limit == -2
1917           && !bidi_explicit_dir_char (bidi_it->ch)))
1918     bidi_it->ignore_bn_limit = -1;
1919
1920   type = bidi_resolve_neutral (bidi_it);
1921
1922   return type;
1923 }
1924
1925 /* Given an iterator state BIDI_IT, advance one character position in
1926    the buffer/string to the next character (in the current scan
1927    direction), resolve the embedding and implicit levels of that next
1928    character, and return the resulting level.  */
1929 static int
1930 bidi_level_of_next_char (struct bidi_it *bidi_it)
1931 {
1932   bidi_type_t type;
1933   int level, prev_level = -1;
1934   struct bidi_saved_info next_for_neutral;
1935   EMACS_INT next_char_pos = -2;
1936
1937   if (bidi_it->scan_dir == 1)
1938     {
1939       EMACS_INT eob =
1940         (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1941         ? bidi_it->string.schars : ZV;
1942
1943       /* There's no sense in trying to advance if we hit end of text.  */
1944       if (bidi_it->charpos >= eob)
1945         return bidi_it->resolved_level;
1946
1947       /* Record the info about the previous character.  */
1948       if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1949           && bidi_it->type != WEAK_BN)
1950         bidi_remember_char (&bidi_it->prev, bidi_it);
1951       if (bidi_it->type_after_w1 == STRONG_R
1952           || bidi_it->type_after_w1 == STRONG_L
1953           || bidi_it->type_after_w1 == STRONG_AL)
1954         bidi_remember_char (&bidi_it->last_strong, bidi_it);
1955       /* FIXME: it sounds like we don't need both prev and
1956          prev_for_neutral members, but I'm leaving them both for now.  */
1957       if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1958           || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1959         bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1960
1961       /* If we overstepped the characters used for resolving neutrals
1962          and whitespace, invalidate their info in the iterator.  */
1963       if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1964         bidi_it->next_for_neutral.type = UNKNOWN_BT;
1965       if (bidi_it->next_en_pos >= 0
1966           && bidi_it->charpos >= bidi_it->next_en_pos)
1967         bidi_it->next_en_pos = -1;
1968       if (bidi_it->next_for_ws.type != UNKNOWN_BT
1969           && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1970         bidi_it->next_for_ws.type = UNKNOWN_BT;
1971
1972       /* This must be taken before we fill the iterator with the info
1973          about the next char.  If we scan backwards, the iterator
1974          state must be already cached, so there's no need to know the
1975          embedding level of the previous character, since we will be
1976          returning to our caller shortly.  */
1977       prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1978     }
1979   next_for_neutral = bidi_it->next_for_neutral;
1980
1981   /* Perhaps the character we want is already cached.  If it is, the
1982      call to bidi_cache_find below will return a type other than
1983      UNKNOWN_BT.  */
1984   if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
1985     {
1986       int bob =
1987         (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) ? 0 : 1;
1988
1989       if (bidi_it->scan_dir > 0)
1990         {
1991           if (bidi_it->nchars <= 0)
1992             abort ();
1993           next_char_pos = bidi_it->charpos + bidi_it->nchars;
1994         }
1995       else if (bidi_it->charpos >= bob)
1996         /* Implementation note: we allow next_char_pos to be as low as
1997            0 for buffers or -1 for strings, and that is okay because
1998            that's the "position" of the sentinel iterator state we
1999            cached at the beginning of the iteration.  */
2000         next_char_pos = bidi_it->charpos - 1;
2001       if (next_char_pos >= bob - 1)
2002         type = bidi_cache_find (next_char_pos, -1, bidi_it);
2003       else
2004         type = UNKNOWN_BT;
2005     }
2006   else
2007     type = UNKNOWN_BT;
2008   if (type != UNKNOWN_BT)
2009     {
2010       /* Don't lose the information for resolving neutrals!  The
2011          cached states could have been cached before their
2012          next_for_neutral member was computed.  If we are on our way
2013          forward, we can simply take the info from the previous
2014          state.  */
2015       if (bidi_it->scan_dir == 1
2016           && bidi_it->next_for_neutral.type == UNKNOWN_BT)
2017         bidi_it->next_for_neutral = next_for_neutral;
2018
2019       /* If resolved_level is -1, it means this state was cached
2020          before it was completely resolved, so we cannot return
2021          it.  */
2022       if (bidi_it->resolved_level != -1)
2023         return bidi_it->resolved_level;
2024     }
2025   if (bidi_it->scan_dir == -1)
2026     /* If we are going backwards, the iterator state is already cached
2027        from previous scans, and should be fully resolved.  */
2028     abort ();
2029
2030   if (type == UNKNOWN_BT)
2031     type = bidi_type_of_next_char (bidi_it);
2032
2033   if (type == NEUTRAL_B)
2034     return bidi_it->resolved_level;
2035
2036   level = bidi_it->level_stack[bidi_it->stack_idx].level;
2037   if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2038       || (type == WEAK_BN && prev_level == level))
2039     {
2040       if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
2041         abort ();
2042
2043       /* If the cached state shows a neutral character, it was not
2044          resolved by bidi_resolve_neutral, so do it now.  */
2045       type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2046                                      bidi_it->next_for_neutral.type,
2047                                      level);
2048     }
2049
2050   if (!(type == STRONG_R
2051         || type == STRONG_L
2052         || type == WEAK_BN
2053         || type == WEAK_EN
2054         || type == WEAK_AN))
2055     abort ();
2056   bidi_it->type = type;
2057   bidi_check_type (bidi_it->type);
2058
2059   /* For L1 below, we need to know, for each WS character, whether
2060      it belongs to a sequence of WS characters preceding a newline
2061      or a TAB or a paragraph separator.  */
2062   if (bidi_it->orig_type == NEUTRAL_WS
2063       && bidi_it->next_for_ws.type == UNKNOWN_BT)
2064     {
2065       int ch;
2066       EMACS_INT clen = bidi_it->ch_len;
2067       EMACS_INT bpos = bidi_it->bytepos;
2068       EMACS_INT cpos = bidi_it->charpos;
2069       EMACS_INT disp_pos = bidi_it->disp_pos;
2070       EMACS_INT nc = bidi_it->nchars;
2071       struct bidi_string_data bs = bidi_it->string;
2072       bidi_type_t chtype;
2073       int fwp = bidi_it->frame_window_p;
2074       int dpp = bidi_it->disp_prop_p;
2075
2076       if (bidi_it->nchars <= 0)
2077         abort ();
2078       do {
2079         ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &dpp, &bs,
2080                               fwp, &clen, &nc);
2081         if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
2082           chtype = NEUTRAL_B;
2083         else
2084           chtype = bidi_get_type (ch, NEUTRAL_DIR);
2085       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2086                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2087       bidi_it->next_for_ws.type = chtype;
2088       bidi_check_type (bidi_it->next_for_ws.type);
2089       bidi_it->next_for_ws.charpos = cpos;
2090       bidi_it->next_for_ws.bytepos = bpos;
2091     }
2092
2093   /* Resolve implicit levels, with a twist: PDFs get the embedding
2094      level of the enbedding they terminate.  See below for the
2095      reason.  */
2096   if (bidi_it->orig_type == PDF
2097       /* Don't do this if this formatting code didn't change the
2098          embedding level due to invalid or empty embeddings.  */
2099       && prev_level != level)
2100     {
2101       /* Don't look in UAX#9 for the reason for this: it's our own
2102          private quirk.  The reason is that we want the formatting
2103          codes to be delivered so that they bracket the text of their
2104          embedding.  For example, given the text
2105
2106              {RLO}teST{PDF}
2107
2108          we want it to be displayed as
2109
2110              {PDF}STet{RLO}
2111
2112          not as
2113
2114              STet{RLO}{PDF}
2115
2116          which will result because we bump up the embedding level as
2117          soon as we see the RLO and pop it as soon as we see the PDF,
2118          so RLO itself has the same embedding level as "teST", and
2119          thus would be normally delivered last, just before the PDF.
2120          The switch below fiddles with the level of PDF so that this
2121          ugly side effect does not happen.
2122
2123          (This is, of course, only important if the formatting codes
2124          are actually displayed, but Emacs does need to display them
2125          if the user wants to.)  */
2126       level = prev_level;
2127     }
2128   else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2129            || bidi_it->orig_type == NEUTRAL_S
2130            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
2131            /* || bidi_it->ch == LINESEP_CHAR */
2132            || (bidi_it->orig_type == NEUTRAL_WS
2133                && (bidi_it->next_for_ws.type == NEUTRAL_B
2134                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
2135     level = bidi_it->level_stack[0].level;
2136   else if ((level & 1) == 0) /* I1 */
2137     {
2138       if (type == STRONG_R)
2139         level++;
2140       else if (type == WEAK_EN || type == WEAK_AN)
2141         level += 2;
2142     }
2143   else                  /* I2 */
2144     {
2145       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2146         level++;
2147     }
2148
2149   bidi_it->resolved_level = level;
2150   return level;
2151 }
2152
2153 /* Move to the other edge of a level given by LEVEL.  If END_FLAG is
2154    non-zero, we are at the end of a level, and we need to prepare to
2155    resume the scan of the lower level.
2156
2157    If this level's other edge is cached, we simply jump to it, filling
2158    the iterator structure with the iterator state on the other edge.
2159    Otherwise, we walk the buffer or string until we come back to the
2160    same level as LEVEL.
2161
2162    Note: we are not talking here about a ``level run'' in the UAX#9
2163    sense of the term, but rather about a ``level'' which includes
2164    all the levels higher than it.  In other words, given the levels
2165    like this:
2166
2167          11111112222222333333334443343222222111111112223322111
2168                 A      B                    C
2169
2170    and assuming we are at point A scanning left to right, this
2171    function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2172    at point B.  */
2173 static void
2174 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
2175 {
2176   int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
2177   ptrdiff_t idx;
2178
2179   /* Try the cache first.  */
2180   if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2181       >= bidi_cache_start)
2182     bidi_cache_fetch_state (idx, bidi_it);
2183   else
2184     {
2185       int new_level;
2186
2187       if (end_flag)
2188         abort (); /* if we are at end of level, its edges must be cached */
2189
2190       bidi_cache_iterator_state (bidi_it, 1);
2191       do {
2192         new_level = bidi_level_of_next_char (bidi_it);
2193         bidi_cache_iterator_state (bidi_it, 1);
2194       } while (new_level >= level);
2195     }
2196 }
2197
2198 void
2199 bidi_move_to_visually_next (struct bidi_it *bidi_it)
2200 {
2201   int old_level, new_level, next_level;
2202   struct bidi_it sentinel;
2203   struct gcpro gcpro1;
2204
2205   if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2206     abort ();
2207
2208   if (bidi_it->scan_dir == 0)
2209     {
2210       bidi_it->scan_dir = 1;    /* default to logical order */
2211     }
2212
2213   /* The code below can call eval, and thus cause GC.  If we are
2214      iterating a Lisp string, make sure it won't be GCed.  */
2215   if (STRINGP (bidi_it->string.lstring))
2216     GCPRO1 (bidi_it->string.lstring);
2217
2218   /* If we just passed a newline, initialize for the next line.  */
2219   if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
2220     bidi_line_init (bidi_it);
2221
2222   /* Prepare the sentinel iterator state, and cache it.  When we bump
2223      into it, scanning backwards, we'll know that the last non-base
2224      level is exhausted.  */
2225   if (bidi_cache_idx == bidi_cache_start)
2226     {
2227       bidi_copy_it (&sentinel, bidi_it);
2228       if (bidi_it->first_elt)
2229         {
2230           sentinel.charpos--;   /* cached charpos needs to be monotonic */
2231           sentinel.bytepos--;
2232           sentinel.ch = '\n';   /* doesn't matter, but why not? */
2233           sentinel.ch_len = 1;
2234           sentinel.nchars = 1;
2235         }
2236       bidi_cache_iterator_state (&sentinel, 1);
2237     }
2238
2239   old_level = bidi_it->resolved_level;
2240   new_level = bidi_level_of_next_char (bidi_it);
2241
2242   /* Reordering of resolved levels (clause L2) is implemented by
2243      jumping to the other edge of the level and flipping direction of
2244      scanning the text whenever we find a level change.  */
2245   if (new_level != old_level)
2246     {
2247       int ascending = new_level > old_level;
2248       int level_to_search = ascending ? old_level + 1 : old_level;
2249       int incr = ascending ? 1 : -1;
2250       int expected_next_level = old_level + incr;
2251
2252       /* Jump (or walk) to the other edge of this level.  */
2253       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2254       /* Switch scan direction and peek at the next character in the
2255          new direction.  */
2256       bidi_it->scan_dir = -bidi_it->scan_dir;
2257
2258       /* The following loop handles the case where the resolved level
2259          jumps by more than one.  This is typical for numbers inside a
2260          run of text with left-to-right embedding direction, but can
2261          also happen in other situations.  In those cases the decision
2262          where to continue after a level change, and in what direction,
2263          is tricky.  For example, given a text like below:
2264
2265                   abcdefgh
2266                   11336622
2267
2268          (where the numbers below the text show the resolved levels),
2269          the result of reordering according to UAX#9 should be this:
2270
2271                   efdcghba
2272
2273          This is implemented by the loop below which flips direction
2274          and jumps to the other edge of the level each time it finds
2275          the new level not to be the expected one.  The expected level
2276          is always one more or one less than the previous one.  */
2277       next_level = bidi_peek_at_next_level (bidi_it);
2278       while (next_level != expected_next_level)
2279         {
2280           expected_next_level += incr;
2281           level_to_search += incr;
2282           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2283           bidi_it->scan_dir = -bidi_it->scan_dir;
2284           next_level = bidi_peek_at_next_level (bidi_it);
2285         }
2286
2287       /* Finally, deliver the next character in the new direction.  */
2288       next_level = bidi_level_of_next_char (bidi_it);
2289     }
2290
2291   /* Take note when we have just processed the newline that precedes
2292      the end of the paragraph.  The next time we are about to be
2293      called, set_iterator_to_next will automatically reinit the
2294      paragraph direction, if needed.  We do this at the newline before
2295      the paragraph separator, because the next character might not be
2296      the first character of the next paragraph, due to the bidi
2297      reordering, whereas we _must_ know the paragraph base direction
2298      _before_ we process the paragraph's text, since the base
2299      direction affects the reordering.  */
2300   if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
2301     {
2302       /* The paragraph direction of the entire string, once
2303          determined, is in effect for the entire string.  Setting the
2304          separator limit to the end of the string prevents
2305          bidi_paragraph_init from being called automatically on this
2306          string.  */
2307       if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2308         bidi_it->separator_limit = bidi_it->string.schars;
2309       else if (bidi_it->bytepos < ZV_BYTE)
2310         {
2311           EMACS_INT sep_len =
2312             bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2313                                    bidi_it->bytepos + bidi_it->ch_len);
2314           if (bidi_it->nchars <= 0)
2315             abort ();
2316           if (sep_len >= 0)
2317             {
2318               bidi_it->new_paragraph = 1;
2319               /* Record the buffer position of the last character of the
2320                  paragraph separator.  */
2321               bidi_it->separator_limit =
2322                 bidi_it->charpos + bidi_it->nchars + sep_len;
2323             }
2324         }
2325     }
2326
2327   if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
2328     {
2329       /* If we are at paragraph's base embedding level and beyond the
2330          last cached position, the cache's job is done and we can
2331          discard it.  */
2332       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
2333           && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2334                                  + bidi_cache[bidi_cache_idx - 1].nchars - 1))
2335         bidi_cache_reset ();
2336         /* But as long as we are caching during forward scan, we must
2337            cache each state, or else the cache integrity will be
2338            compromised: it assumes cached states correspond to buffer
2339            positions 1:1.  */
2340       else
2341         bidi_cache_iterator_state (bidi_it, 1);
2342     }
2343
2344   if (STRINGP (bidi_it->string.lstring))
2345     UNGCPRO;
2346 }
2347
2348 /* This is meant to be called from within the debugger, whenever you
2349    wish to examine the cache contents.  */
2350 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
2351 void
2352 bidi_dump_cached_states (void)
2353 {
2354   ptrdiff_t i;
2355   int ndigits = 1;
2356
2357   if (bidi_cache_idx == 0)
2358     {
2359       fprintf (stderr, "The cache is empty.\n");
2360       return;
2361     }
2362   fprintf (stderr, "Total of  %"pD"d state%s in cache:\n",
2363            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2364
2365   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2366     ndigits++;
2367   fputs ("ch  ", stderr);
2368   for (i = 0; i < bidi_cache_idx; i++)
2369     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2370   fputs ("\n", stderr);
2371   fputs ("lvl ", stderr);
2372   for (i = 0; i < bidi_cache_idx; i++)
2373     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2374   fputs ("\n", stderr);
2375   fputs ("pos ", stderr);
2376   for (i = 0; i < bidi_cache_idx; i++)
2377     fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
2378   fputs ("\n", stderr);
2379 }