]> code.delx.au - gnu-emacs/blob - src/bidi.c
d354aa796ae3175de7128a7db163d53d30e04627
[gnu-emacs] / src / bidi.c
1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
3 Foundation, Inc.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
19
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 (UBA) as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the Reference Implementation and most other implementations,
26 this one is designed to be called once for every character in the
27 buffer or string. That way, we can leave intact the design of the
28 Emacs display engine, whereby an iterator object is used to
29 traverse buffer or string text character by character, and generate
30 the necessary data for displaying each character in 'struct glyph'
31 objects. (See xdisp.c for the details of that iteration.) The
32 functions on this file replace the original linear iteration in the
33 logical order of the text with a non-linear iteration in the visual
34 order, i.e. in the order characters should be shown on display.
35
36 The main entry point is bidi_move_to_visually_next. Each time it
37 is called, it finds the next character in the visual order, and
38 returns its information in a special structure. The caller is then
39 expected to process this character for display or any other
40 purposes, and call bidi_move_to_visually_next for the next
41 character. See the comments in bidi_move_to_visually_next for more
42 details about its algorithm that finds the next visual-order
43 character by resolving their levels on the fly.
44
45 Two other entry points are bidi_paragraph_init and
46 bidi_mirror_char. The first determines the base direction of a
47 paragraph, while the second returns the mirrored version of its
48 argument character.
49
50 A few auxiliary entry points are used to initialize the bidi
51 iterator for iterating an object (buffer or string), push and pop
52 the bidi iterator state, and save and restore the state of the bidi
53 cache.
54
55 If you want to understand the code, you will have to read it
56 together with the relevant portions of UAX#9. The comments include
57 references to UAX#9 rules, for that very reason.
58
59 A note about references to UAX#9 rules: if the reference says
60 something like "X9/Retaining", it means that you need to refer to
61 rule X9 and to its modifications described in the "Implementation
62 Notes" section of UAX#9, under "Retaining Format Codes".
63
64 Here's the overview of the design of the reordering engine
65 implemented by this file.
66
67 Basic implementation structure
68 ------------------------------
69
70 The sequential processing steps described by UAX#9 are implemented
71 as recursive levels of processing, all of which examine the next
72 character in the logical order. This hierarchy of processing looks
73 as follows, from the innermost (deepest) to the outermost level,
74 omitting some subroutines used by each level:
75
76 bidi_fetch_char -- fetch next character
77 bidi_resolve_explicit -- resolve explicit levels and directions
78 bidi_resolve_weak -- resolve weak types
79 bidi_resolve_brackets -- resolve "paired brackets" neutral types
80 bidi_resolve_neutral -- resolve neutral types
81 bidi_level_of_next_char -- resolve implicit levels
82
83 Each level calls the level below it, and works on the result
84 returned by the lower level, including all of its sub-levels.
85
86 Unlike all the levels below it, bidi_level_of_next_char can return
87 the information about either the next or previous character in the
88 logical order, depending on the current direction of scanning the
89 buffer or string. For the next character, it calls all the levels
90 below it; for the previous character, it uses the cache, described
91 below.
92
93 Thus, the result of calling bidi_level_of_next_char is the resolved
94 level of the next or the previous character in the logical order.
95 Based on this information, the function bidi_move_to_visually_next
96 finds the next character in the visual order and updates the
97 direction in which the buffer is scanned, either forward or
98 backward, to find the next character to be displayed. (Text is
99 scanned backwards when it needs to be reversed for display, i.e. if
100 the visual order is the inverse of the logical order.) This
101 implements the last, reordering steps of the UBA, by successively
102 calling bidi_level_of_next_char until the character of the required
103 embedding level is found; the scan direction is dynamically updated
104 as a side effect. See the commentary before the 'while' loop in
105 bidi_move_to_visually_next, for the details.
106
107 Fetching characters
108 -------------------
109
110 In a nutshell, fetching the next character boils down to calling
111 STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
112 string position. See bidi_fetch_char. However, if the next
113 character is "covered" by a display property of some kind,
114 bidi_fetch_char returns the u+FFFC "object replacement character"
115 that represents the entire run of text covered by the display
116 property. (The ch_len and nchars members of 'struct bidi_it'
117 reflect the length in bytes and characters of that text.) This is
118 so we reorder text on both sides of the display property as
119 appropriate for an image or embedded string. Similarly, text
120 covered by a display spec of the form '(space ...)', is replaced
121 with the u+2029 paragraph separator character, so such display
122 specs produce the same effect as a TAB under UBA. Both these
123 special characters are not actually displayed -- the display
124 property is displayed instead -- but just used to compute the
125 embedding level of the surrounding text so as to produce the
126 required effect.
127
128 Bidi iterator states
129 --------------------
130
131 The UBA is highly context dependent in some of its parts,
132 i.e. results of processing a character can generally depend on
133 characters very far away. The UAX#9 description of the UBA
134 prescribes a stateful processing of each character, whereby the
135 results of this processing depend on various state variables, such
136 as the current embedding level, level stack, and directional
137 override status. In addition, the UAX#9 description includes many
138 passages like this (from rule W2 in this case):
139
140 Search backward from each instance of a European number until the
141 first strong type (R, L, AL, or sos) is found. If an AL is found,
142 change the type of the European number to Arabic number.
143
144 To support this, we use a bidi iterator object, 'struct bidi_it',
145 which is a sub-structure of 'struct it' used by xdisp.c (see
146 dispextern.h for the definition of both of these structures). The
147 bidi iterator holds the entire state of the iteration required by
148 the UBA, and is updated as the text is traversed. In particular,
149 the embedding level of the current character being resolved is
150 recorded in the iterator state. To avoid costly searches backward
151 in support of rules like W2 above, the necessary character types
152 are also recorded in the iterator state as they are found during
153 the forward scan, and then used when such rules need to be applied.
154 (Forward scans cannot be avoided in this way; they need to be
155 performed at least once, and the results recorded in the iterator
156 state, to be reused until the forward scan oversteps the recorded
157 position.)
158
159 In this manner, the iterator state acts as a mini-cache of
160 contextual information required for resolving the level of the
161 current character by various UBA rules.
162
163 Caching of bidi iterator states
164 -------------------------------
165
166 As described above, the reordering engine uses the information
167 recorded in the bidi iterator state in order to resolve the
168 embedding level of the current character. When the reordering
169 engine needs to process the next character in the logical order, it
170 fetches it and applies to it all the UBA levels, updating the
171 iterator state as it goes. But when the buffer or string is
172 scanned backwards, i.e. in the reverse order of buffer/string
173 positions, the scanned characters were already processed during the
174 preceding forward scan (see bidi_find_other_level_edge). To avoid
175 costly re-processing of characters that were already processed
176 during the forward scan, the iterator states computed while
177 scanning forward are cached.
178
179 The cache is just a linear array of 'struct bidi_it' objects, which
180 is dynamically allocated and reallocated as needed, since the size
181 of the cache depends on the text being processed. We only need the
182 cache while processing embedded levels higher than the base
183 paragraph embedding level, because these higher levels require
184 changes in scan direction. Therefore, as soon as we are back to
185 the base embedding level, we can free the cache; see the calls to
186 bidi_cache_reset and bidi_cache_shrink, for the conditions to do
187 this.
188
189 The cache maintains the index of the next unused cache slot -- this
190 is where the next iterator state will be cached. The function
191 bidi_cache_iterator_state saves an instance of the state in the
192 cache and increments the unused slot index. The companion function
193 bidi_cache_find looks up a cached state that corresponds to a given
194 buffer/string position. All of the cached states must correspond
195 1:1 to the buffer or string region whose processing they reflect;
196 bidi.c will abort if it finds cache slots that violate this 1:1
197 correspondence.
198
199 When the parent iterator 'struct it' is pushed (see push_it in
200 xdisp.c) to pause the current iteration and start iterating over a
201 different object (e.g., a 'display' string that covers some buffer
202 text), the bidi iterator cache needs to be "pushed" as well, so
203 that a new empty cache could be used while iterating over the new
204 object. Later, when the new object is exhausted, and xdisp.c calls
205 pop_it, we need to "pop" the bidi cache as well and return to the
206 original cache. See bidi_push_it and bidi_pop_it for how this is
207 done.
208
209 Some functions of the display engine save copies of 'struct it' in
210 local variables, and restore them later. For examples, see
211 pos_visible_p and move_it_in_display_line_to in xdisp.c, and
212 window_scroll_pixel_based in window.c. When this happens, we need
213 to save and restore the bidi cache as well, because conceptually
214 the cache is part of the 'struct it' state, and needs to be in
215 perfect sync with the portion of the buffer/string that is being
216 processed. This saving and restoring of the cache state is handled
217 by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
218 SAVE_IT and RESTORE_IT defined on xdisp.c.
219
220 Note that, because reordering is implemented below the level in
221 xdisp.c that breaks glyphs into screen lines, we are violating
222 paragraph 3.4 of UAX#9. which mandates that line breaking shall be
223 done before reordering each screen line separately. However,
224 following UAX#9 to the letter in this matter goes against the basic
225 design of the Emacs display engine, and so we choose here this
226 minor deviation from the UBA letter in preference to redesign of
227 the display engine. The effect of this is only seen in continued
228 lines that are broken into screen lines in the middle of a run
229 whose direction is opposite to the paragraph's base direction.
230
231 Important design and implementation note: when the code needs to
232 scan far ahead, be sure to avoid such scans as much as possible
233 when the buffer/string doesn't contain any RTL characters. Users
234 of left-to-right scripts will never forgive you if you introduce
235 some slow-down due to bidi in situations that don't involve any
236 bidirectional text. See the large comment near the beginning of
237 bidi_resolve_neutral, for one situation where such shortcut was
238 necessary. */
239
240 #include <config.h>
241 #include <stdio.h>
242
243 #include "lisp.h"
244 #include "character.h"
245 #include "buffer.h"
246 #include "dispextern.h"
247 #include "region-cache.h"
248
249 static bool bidi_initialized = 0;
250
251 static Lisp_Object bidi_type_table, bidi_mirror_table, bidi_brackets_table;
252
253 #define BIDI_EOB (-1)
254
255 /* Data type for describing the bidirectional character categories. */
256 typedef enum {
257 UNKNOWN_BC,
258 NEUTRAL,
259 WEAK,
260 STRONG,
261 EXPLICIT_FORMATTING
262 } bidi_category_t;
263
264 static Lisp_Object paragraph_start_re, paragraph_separate_re;
265 static Lisp_Object Qparagraph_start, Qparagraph_separate;
266
267 \f
268 /***********************************************************************
269 Utilities
270 ***********************************************************************/
271
272 /* Return the bidi type of a character CH, subject to the current
273 directional OVERRIDE. */
274 static bidi_type_t
275 bidi_get_type (int ch, bidi_dir_t override)
276 {
277 bidi_type_t default_type;
278
279 if (ch == BIDI_EOB)
280 return NEUTRAL_B;
281 if (ch < 0 || ch > MAX_CHAR)
282 emacs_abort ();
283
284 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
285 /* Every valid character code, even those that are unassigned by the
286 UCD, have some bidi-class property, according to
287 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
288 (= zero) code from CHAR_TABLE_REF, that's a bug. */
289 if (default_type == UNKNOWN_BT)
290 emacs_abort ();
291
292 switch (default_type)
293 {
294 case WEAK_BN:
295 case NEUTRAL_B:
296 case LRE:
297 case LRO:
298 case RLE:
299 case RLO:
300 case PDF:
301 case LRI:
302 case RLI:
303 case FSI:
304 case PDI:
305 return default_type;
306 default:
307 if (override == L2R)
308 return STRONG_L;
309 else if (override == R2L)
310 return STRONG_R;
311 else
312 return default_type;
313 }
314 }
315
316 static void
317 bidi_check_type (bidi_type_t type)
318 {
319 eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
320 }
321
322 /* Given a bidi TYPE of a character, return its category. */
323 static bidi_category_t
324 bidi_get_category (bidi_type_t type)
325 {
326 switch (type)
327 {
328 case UNKNOWN_BT:
329 return UNKNOWN_BC;
330 case STRONG_L:
331 case STRONG_R:
332 case STRONG_AL:
333 return STRONG;
334 case WEAK_EN:
335 case WEAK_ES:
336 case WEAK_ET:
337 case WEAK_AN:
338 case WEAK_CS:
339 case WEAK_NSM:
340 case WEAK_BN:
341 return WEAK;
342 case NEUTRAL_B:
343 case NEUTRAL_S:
344 case NEUTRAL_WS:
345 case NEUTRAL_ON:
346 return NEUTRAL;
347 case LRE:
348 case LRO:
349 case RLE:
350 case RLO:
351 case PDF:
352 case LRI:
353 case RLI:
354 case FSI:
355 case PDI:
356 return EXPLICIT_FORMATTING;
357 default:
358 emacs_abort ();
359 }
360 }
361
362 static bool
363 bidi_isolate_fmt_char (bidi_type_t ch_type)
364 {
365 return (ch_type == LRI || ch_type == RLI || ch_type == PDI || ch_type == FSI);
366 }
367
368 /* Return the mirrored character of C, if it has one. If C has no
369 mirrored counterpart, return C.
370 Note: The conditions in UAX#9 clause L4 regarding the surrounding
371 context must be tested by the caller. */
372 int
373 bidi_mirror_char (int c)
374 {
375 Lisp_Object val;
376
377 if (c == BIDI_EOB)
378 return c;
379 if (c < 0 || c > MAX_CHAR)
380 emacs_abort ();
381
382 val = CHAR_TABLE_REF (bidi_mirror_table, c);
383 if (INTEGERP (val))
384 {
385 int v;
386
387 /* When debugging, check before assigning to V, so that the check
388 isn't broken by undefined behavior due to int overflow. */
389 eassert (CHAR_VALID_P (XINT (val)));
390
391 v = XINT (val);
392
393 /* Minimal test we must do in optimized builds, to prevent weird
394 crashes further down the road. */
395 if (v < 0 || v > MAX_CHAR)
396 emacs_abort ();
397
398 return v;
399 }
400
401 return c;
402 }
403
404 /* Return the Bidi_Paired_Bracket_Type property of the character C. */
405 static bidi_bracket_type_t
406 bidi_paired_bracket_type (int c)
407 {
408 if (c == BIDI_EOB)
409 return BIDI_BRACKET_NONE;
410 if (c < 0 || c > MAX_CHAR)
411 emacs_abort ();
412
413 return (bidi_bracket_type_t) XINT (CHAR_TABLE_REF (bidi_brackets_table, c));
414 }
415
416 /* Determine the start-of-sequence (sos) directional type given the two
417 embedding levels on either side of the run boundary. Also, update
418 the saved info about previously seen characters, since that info is
419 generally valid for a single level run. */
420 static void
421 bidi_set_sos_type (struct bidi_it *bidi_it, int level_before, int level_after)
422 {
423 int higher_level = (level_before > level_after ? level_before : level_after);
424
425 /* FIXME: should the default sos direction be user selectable? */
426 bidi_it->sos = ((higher_level & 1) != 0 ? R2L : L2R); /* X10 */
427
428 bidi_it->prev.type = UNKNOWN_BT;
429 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
430 bidi_it->prev_for_neutral.type = (bidi_it->sos == R2L ? STRONG_R : STRONG_L);
431 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
432 bidi_it->next_for_neutral.type
433 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
434 }
435
436 /* Push the current embedding level and override status; reset the
437 current level to LEVEL and the current override status to OVERRIDE. */
438 static void
439 bidi_push_embedding_level (struct bidi_it *bidi_it,
440 int level, bidi_dir_t override, bool isolate_status)
441 {
442 struct bidi_stack *st;
443 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
444
445 bidi_it->stack_idx++;
446 eassert (bidi_it->stack_idx < BIDI_MAXDEPTH+2+1);
447 st = &bidi_it->level_stack[bidi_it->stack_idx];
448 eassert (level <= (1 << 7));
449 st->level = level;
450 st->override = override;
451 st->isolate_status = isolate_status;
452 if (isolate_status)
453 {
454 st->last_strong = bidi_it->last_strong;
455 st->prev_for_neutral = bidi_it->prev_for_neutral;
456 st->next_for_neutral = bidi_it->next_for_neutral;
457 st->sos = bidi_it->sos;
458 }
459 /* We've got a new isolating sequence, compute the directional type
460 of sos and initialize per-sequence variables (UAX#9, clause X10). */
461 bidi_set_sos_type (bidi_it, prev_level, level);
462 }
463
464 /* Pop from the stack the embedding level, the directional override
465 status, and optionally saved information for the isolating run
466 sequence. Return the new level. */
467 static int
468 bidi_pop_embedding_level (struct bidi_it *bidi_it)
469 {
470 int level;
471
472 /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
473 and PDIs (X6a, 2nd bullet). */
474 if (bidi_it->stack_idx > 0)
475 {
476 bool isolate_status
477 = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
478 int old_level = bidi_it->level_stack[bidi_it->stack_idx].level;
479
480 struct bidi_stack st;
481
482 st = bidi_it->level_stack[bidi_it->stack_idx];
483 if (isolate_status)
484 {
485 /* PREV is used in W1 for resolving WEAK_NSM. By the time
486 we get to an NSM, we must have gotten past at least one
487 character: the PDI that ends the isolate from which we
488 are popping here. So PREV will have been filled up by
489 the time we first use it. We initialize it here to
490 UNKNOWN_BT to be able to catch any blunders in this
491 logic. */
492 bidi_it->prev.orig_type = bidi_it->prev.type = UNKNOWN_BT;
493 bidi_it->last_strong = st.last_strong;
494 bidi_it->prev_for_neutral = st.prev_for_neutral;
495 bidi_it->next_for_neutral = st.next_for_neutral;
496 bidi_it->sos = st.sos;
497 }
498 else
499 bidi_set_sos_type (bidi_it, old_level,
500 bidi_it->level_stack[bidi_it->stack_idx - 1].level);
501
502 bidi_it->stack_idx--;
503 }
504 level = bidi_it->level_stack[bidi_it->stack_idx].level;
505 eassert (0 <= level && level <= BIDI_MAXDEPTH + 1);
506 return level;
507 }
508
509 /* Record in SAVED_INFO the information about the current character. */
510 static void
511 bidi_remember_char (struct bidi_saved_info *saved_info,
512 struct bidi_it *bidi_it, bool from_type)
513 {
514 saved_info->charpos = bidi_it->charpos;
515 if (from_type)
516 saved_info->type = bidi_it->type;
517 else
518 saved_info->type = bidi_it->type_after_wn;
519 bidi_check_type (saved_info->type);
520 saved_info->orig_type = bidi_it->orig_type;
521 bidi_check_type (saved_info->orig_type);
522 }
523
524 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
525 copies the part of the level stack that is actually in use. */
526 static void
527 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
528 {
529 /* Copy everything from the start through the active part of
530 the level stack. */
531 memcpy (to, from,
532 (offsetof (struct bidi_it, level_stack[1])
533 + from->stack_idx * sizeof from->level_stack[0]));
534 }
535
536 \f
537 /***********************************************************************
538 Caching the bidi iterator states
539 ***********************************************************************/
540
541 /* We allocate and de-allocate the cache in chunks of this size (in
542 characters). 200 was chosen as an upper limit for reasonably-long
543 lines in a text file/buffer. */
544 #define BIDI_CACHE_CHUNK 200
545 static struct bidi_it *bidi_cache;
546 static ptrdiff_t bidi_cache_size = 0;
547 enum { elsz = sizeof (struct bidi_it) };
548 static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
549 static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
550 static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
551 "stack" level */
552
553 /* 5-slot stack for saving the start of the previous level of the
554 cache. xdisp.c maintains a 5-slot stack for its iterator state,
555 and we need the same size of our stack. */
556 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
557 static int bidi_cache_sp;
558
559 /* Size of header used by bidi_shelve_cache. */
560 enum
561 {
562 bidi_shelve_header_size
563 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
564 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
565 + sizeof (bidi_cache_last_idx))
566 };
567
568 /* Reset the cache state to the empty state. We only reset the part
569 of the cache relevant to iteration of the current object. Previous
570 objects, which are pushed on the display iterator's stack, are left
571 intact. This is called when the cached information is no more
572 useful for the current iteration, e.g. when we were reseated to a
573 new position on the same object. */
574 static void
575 bidi_cache_reset (void)
576 {
577 bidi_cache_idx = bidi_cache_start;
578 bidi_cache_last_idx = -1;
579 }
580
581 /* Shrink the cache to its minimal size. Called when we init the bidi
582 iterator for reordering a buffer or a string that does not come
583 from display properties, because that means all the previously
584 cached info is of no further use. */
585 static void
586 bidi_cache_shrink (void)
587 {
588 if (bidi_cache_size > BIDI_CACHE_CHUNK)
589 {
590 bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
591 bidi_cache_size = BIDI_CACHE_CHUNK;
592 }
593 bidi_cache_reset ();
594 }
595
596 static void
597 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
598 {
599 int current_scan_dir = bidi_it->scan_dir;
600
601 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
602 emacs_abort ();
603
604 bidi_copy_it (bidi_it, &bidi_cache[idx]);
605 bidi_it->scan_dir = current_scan_dir;
606 bidi_cache_last_idx = idx;
607 }
608
609 /* Find a cached state with a given CHARPOS and resolved embedding
610 level less or equal to LEVEL. If LEVEL is -1, disregard the
611 resolved levels in cached states. DIR, if non-zero, means search
612 in that direction from the last cache hit. */
613 static ptrdiff_t
614 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
615 {
616 ptrdiff_t i, i_start;
617
618 if (bidi_cache_idx > bidi_cache_start)
619 {
620 if (bidi_cache_last_idx == -1)
621 bidi_cache_last_idx = bidi_cache_idx - 1;
622 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
623 {
624 dir = -1;
625 i_start = bidi_cache_last_idx - 1;
626 }
627 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
628 + bidi_cache[bidi_cache_last_idx].nchars - 1))
629 {
630 dir = 1;
631 i_start = bidi_cache_last_idx + 1;
632 }
633 else if (dir)
634 i_start = bidi_cache_last_idx;
635 else
636 {
637 dir = -1;
638 i_start = bidi_cache_idx - 1;
639 }
640
641 if (dir < 0)
642 {
643 /* Linear search for now; FIXME! */
644 for (i = i_start; i >= bidi_cache_start; i--)
645 if (bidi_cache[i].charpos <= charpos
646 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
647 && (level == -1 || bidi_cache[i].resolved_level <= level))
648 return i;
649 }
650 else
651 {
652 for (i = i_start; i < bidi_cache_idx; i++)
653 if (bidi_cache[i].charpos <= charpos
654 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
655 && (level == -1 || bidi_cache[i].resolved_level <= level))
656 return i;
657 }
658 }
659
660 return -1;
661 }
662
663 /* Find a cached state where the resolved level changes to a value
664 that is lower than LEVEL, and return its cache slot index. DIR is
665 the direction to search, starting with the last used cache slot.
666 If DIR is zero, we search backwards from the last occupied cache
667 slot. BEFORE means return the index of the slot that
668 is ``before'' the level change in the search direction. That is,
669 given the cached levels like this:
670
671 1122333442211
672 AB C
673
674 and assuming we are at the position cached at the slot marked with
675 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
676 index of slot B or A, depending whether BEFORE is, respectively,
677 true or false. */
678 static ptrdiff_t
679 bidi_cache_find_level_change (int level, int dir, bool before)
680 {
681 if (bidi_cache_idx)
682 {
683 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
684 int incr = before ? 1 : 0;
685
686 eassert (!dir || bidi_cache_last_idx >= 0);
687
688 if (!dir)
689 dir = -1;
690 else if (!incr)
691 i += dir;
692
693 if (dir < 0)
694 {
695 while (i >= bidi_cache_start + incr)
696 {
697 if (bidi_cache[i - incr].resolved_level >= 0
698 && bidi_cache[i - incr].resolved_level < level)
699 return i;
700 i--;
701 }
702 }
703 else
704 {
705 while (i < bidi_cache_idx - incr)
706 {
707 if (bidi_cache[i + incr].resolved_level >= 0
708 && bidi_cache[i + incr].resolved_level < level)
709 return i;
710 i++;
711 }
712 }
713 }
714
715 return -1;
716 }
717
718 static void
719 bidi_cache_ensure_space (ptrdiff_t idx)
720 {
721 /* Enlarge the cache as needed. */
722 if (idx >= bidi_cache_size)
723 {
724 /* The bidi cache cannot be larger than the largest Lisp string
725 or buffer. */
726 ptrdiff_t string_or_buffer_bound
727 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
728
729 /* Also, it cannot be larger than what C can represent. */
730 ptrdiff_t c_bound
731 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
732
733 bidi_cache
734 = xpalloc (bidi_cache, &bidi_cache_size,
735 max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
736 min (string_or_buffer_bound, c_bound), elsz);
737 }
738 }
739
740 static void
741 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved,
742 bool update_only)
743 {
744 ptrdiff_t idx;
745
746 /* We should never cache on backward scans. */
747 if (bidi_it->scan_dir == -1)
748 emacs_abort ();
749 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
750
751 if (idx < 0 && update_only)
752 return;
753
754 if (idx < 0)
755 {
756 idx = bidi_cache_idx;
757 bidi_cache_ensure_space (idx);
758 /* Character positions should correspond to cache positions 1:1.
759 If we are outside the range of cached positions, the cache is
760 useless and must be reset. */
761 if (idx > bidi_cache_start &&
762 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
763 + bidi_cache[idx - 1].nchars)
764 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
765 {
766 bidi_cache_reset ();
767 idx = bidi_cache_start;
768 }
769 if (bidi_it->nchars <= 0)
770 emacs_abort ();
771 bidi_copy_it (&bidi_cache[idx], bidi_it);
772 if (!resolved)
773 bidi_cache[idx].resolved_level = -1;
774 }
775 else
776 {
777 /* Copy only the members which could have changed, to avoid
778 costly copying of the entire struct. */
779 bidi_cache[idx].type = bidi_it->type;
780 bidi_check_type (bidi_it->type);
781 bidi_cache[idx].type_after_wn = bidi_it->type_after_wn;
782 bidi_check_type (bidi_it->type_after_wn);
783 if (resolved)
784 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
785 else
786 bidi_cache[idx].resolved_level = -1;
787 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
788 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
789 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
790 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
791 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
792 bidi_cache[idx].bracket_pairing_pos = bidi_it->bracket_pairing_pos;
793 bidi_cache[idx].bracket_enclosed_type = bidi_it->bracket_enclosed_type;
794 }
795
796 bidi_cache_last_idx = idx;
797 if (idx >= bidi_cache_idx)
798 bidi_cache_idx = idx + 1;
799 }
800
801 /* Look for a cached iterator state that corresponds to CHARPOS. If
802 found, copy the cached state into BIDI_IT and return the type of
803 the cached entry. If not found, return UNKNOWN_BT. RESOLVED_ONLY
804 zero means it is OK to return cached states that were not fully
805 resolved yet. This can happen if the state was cached before it
806 was resolved in bidi_resolve_neutral. */
807 static bidi_type_t
808 bidi_cache_find (ptrdiff_t charpos, bool resolved_only, struct bidi_it *bidi_it)
809 {
810 ptrdiff_t i = bidi_cache_search (charpos, -1, bidi_it->scan_dir);
811
812 if (i >= bidi_cache_start
813 && (!resolved_only
814 /* Callers that want only fully resolved states (and set
815 resolved_only = true) need to be sure that there's enough
816 info in the cached state to return the state as final,
817 and if not, they don't want the cached state. */
818 || bidi_cache[i].resolved_level >= 0))
819 {
820 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
821
822 bidi_copy_it (bidi_it, &bidi_cache[i]);
823 bidi_cache_last_idx = i;
824 /* Don't let scan direction from the cached state override
825 the current scan direction. */
826 bidi_it->scan_dir = current_scan_dir;
827 return bidi_it->type;
828 }
829
830 return UNKNOWN_BT;
831 }
832
833 static int
834 bidi_peek_at_next_level (struct bidi_it *bidi_it)
835 {
836 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
837 emacs_abort ();
838 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
839 }
840
841 \f
842 /***********************************************************************
843 Pushing and popping the bidi iterator state
844 ***********************************************************************/
845
846 /* Push the bidi iterator state in preparation for reordering a
847 different object, e.g. display string found at certain buffer
848 position. Pushing the bidi iterator boils down to saving its
849 entire state on the cache and starting a new cache "stacked" on top
850 of the current cache. */
851 void
852 bidi_push_it (struct bidi_it *bidi_it)
853 {
854 /* Save the current iterator state in its entirety after the last
855 used cache slot. */
856 bidi_cache_ensure_space (bidi_cache_idx);
857 bidi_cache[bidi_cache_idx++] = *bidi_it;
858
859 /* Push the current cache start onto the stack. */
860 eassert (bidi_cache_sp < IT_STACK_SIZE);
861 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
862
863 /* Start a new level of cache, and make it empty. */
864 bidi_cache_start = bidi_cache_idx;
865 bidi_cache_last_idx = -1;
866 }
867
868 /* Restore the iterator state saved by bidi_push_it and return the
869 cache to the corresponding state. */
870 void
871 bidi_pop_it (struct bidi_it *bidi_it)
872 {
873 if (bidi_cache_start <= 0)
874 emacs_abort ();
875
876 /* Reset the next free cache slot index to what it was before the
877 call to bidi_push_it. */
878 bidi_cache_idx = bidi_cache_start - 1;
879
880 /* Restore the bidi iterator state saved in the cache. */
881 *bidi_it = bidi_cache[bidi_cache_idx];
882
883 /* Pop the previous cache start from the stack. */
884 if (bidi_cache_sp <= 0)
885 emacs_abort ();
886 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
887
888 /* Invalidate the last-used cache slot data. */
889 bidi_cache_last_idx = -1;
890 }
891
892 static ptrdiff_t bidi_cache_total_alloc;
893
894 /* Stash away a copy of the cache and its control variables. */
895 void *
896 bidi_shelve_cache (void)
897 {
898 unsigned char *databuf;
899 ptrdiff_t alloc;
900
901 /* Empty cache. */
902 if (bidi_cache_idx == 0)
903 return NULL;
904
905 alloc = (bidi_shelve_header_size
906 + bidi_cache_idx * sizeof (struct bidi_it));
907 databuf = xmalloc (alloc);
908 bidi_cache_total_alloc += alloc;
909
910 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
911 memcpy (databuf + sizeof (bidi_cache_idx),
912 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
913 memcpy (databuf + sizeof (bidi_cache_idx)
914 + bidi_cache_idx * sizeof (struct bidi_it),
915 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
916 memcpy (databuf + sizeof (bidi_cache_idx)
917 + bidi_cache_idx * sizeof (struct bidi_it)
918 + sizeof (bidi_cache_start_stack),
919 &bidi_cache_sp, sizeof (bidi_cache_sp));
920 memcpy (databuf + sizeof (bidi_cache_idx)
921 + bidi_cache_idx * sizeof (struct bidi_it)
922 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
923 &bidi_cache_start, sizeof (bidi_cache_start));
924 memcpy (databuf + sizeof (bidi_cache_idx)
925 + bidi_cache_idx * sizeof (struct bidi_it)
926 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
927 + sizeof (bidi_cache_start),
928 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
929
930 return databuf;
931 }
932
933 /* Restore the cache state from a copy stashed away by
934 bidi_shelve_cache, and free the buffer used to stash that copy.
935 JUST_FREE means free the buffer, but don't restore the
936 cache; used when the corresponding iterator is discarded instead of
937 being restored. */
938 void
939 bidi_unshelve_cache (void *databuf, bool just_free)
940 {
941 unsigned char *p = databuf;
942
943 if (!p)
944 {
945 if (!just_free)
946 {
947 /* A NULL pointer means an empty cache. */
948 bidi_cache_start = 0;
949 bidi_cache_sp = 0;
950 bidi_cache_reset ();
951 }
952 }
953 else
954 {
955 if (just_free)
956 {
957 ptrdiff_t idx;
958
959 memcpy (&idx, p, sizeof (bidi_cache_idx));
960 bidi_cache_total_alloc
961 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
962 }
963 else
964 {
965 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
966 bidi_cache_ensure_space (bidi_cache_idx);
967 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
968 bidi_cache_idx * sizeof (struct bidi_it));
969 memcpy (bidi_cache_start_stack,
970 p + sizeof (bidi_cache_idx)
971 + bidi_cache_idx * sizeof (struct bidi_it),
972 sizeof (bidi_cache_start_stack));
973 memcpy (&bidi_cache_sp,
974 p + sizeof (bidi_cache_idx)
975 + bidi_cache_idx * sizeof (struct bidi_it)
976 + sizeof (bidi_cache_start_stack),
977 sizeof (bidi_cache_sp));
978 memcpy (&bidi_cache_start,
979 p + sizeof (bidi_cache_idx)
980 + bidi_cache_idx * sizeof (struct bidi_it)
981 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
982 sizeof (bidi_cache_start));
983 memcpy (&bidi_cache_last_idx,
984 p + sizeof (bidi_cache_idx)
985 + bidi_cache_idx * sizeof (struct bidi_it)
986 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
987 + sizeof (bidi_cache_start),
988 sizeof (bidi_cache_last_idx));
989 bidi_cache_total_alloc
990 -= (bidi_shelve_header_size
991 + bidi_cache_idx * sizeof (struct bidi_it));
992 }
993
994 xfree (p);
995 }
996 }
997
998 \f
999 /***********************************************************************
1000 Initialization
1001 ***********************************************************************/
1002 static void
1003 bidi_initialize (void)
1004 {
1005 bidi_type_table = uniprop_table (intern ("bidi-class"));
1006 if (NILP (bidi_type_table))
1007 emacs_abort ();
1008 staticpro (&bidi_type_table);
1009
1010 bidi_mirror_table = uniprop_table (intern ("mirroring"));
1011 if (NILP (bidi_mirror_table))
1012 emacs_abort ();
1013 staticpro (&bidi_mirror_table);
1014
1015 bidi_brackets_table = uniprop_table (intern ("bracket-type"));
1016 if (NILP (bidi_brackets_table))
1017 emacs_abort ();
1018 staticpro (&bidi_brackets_table);
1019
1020 Qparagraph_start = intern ("paragraph-start");
1021 staticpro (&Qparagraph_start);
1022 paragraph_start_re = Fsymbol_value (Qparagraph_start);
1023 if (!STRINGP (paragraph_start_re))
1024 paragraph_start_re = build_string ("\f\\|[ \t]*$");
1025 staticpro (&paragraph_start_re);
1026 Qparagraph_separate = intern ("paragraph-separate");
1027 staticpro (&Qparagraph_separate);
1028 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
1029 if (!STRINGP (paragraph_separate_re))
1030 paragraph_separate_re = build_string ("[ \t\f]*$");
1031 staticpro (&paragraph_separate_re);
1032
1033 bidi_cache_sp = 0;
1034 bidi_cache_total_alloc = 0;
1035
1036 bidi_initialized = 1;
1037 }
1038
1039 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1040 end. */
1041 static void
1042 bidi_set_paragraph_end (struct bidi_it *bidi_it)
1043 {
1044 bidi_it->invalid_levels = 0;
1045 bidi_it->invalid_isolates = 0;
1046 bidi_it->stack_idx = 0;
1047 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1048 }
1049
1050 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
1051 void
1052 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
1053 struct bidi_it *bidi_it)
1054 {
1055 if (! bidi_initialized)
1056 bidi_initialize ();
1057 if (charpos >= 0)
1058 bidi_it->charpos = charpos;
1059 if (bytepos >= 0)
1060 bidi_it->bytepos = bytepos;
1061 bidi_it->frame_window_p = frame_window_p;
1062 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit */
1063 bidi_it->first_elt = 1;
1064 bidi_set_paragraph_end (bidi_it);
1065 bidi_it->new_paragraph = 1;
1066 bidi_it->separator_limit = -1;
1067 bidi_it->type = NEUTRAL_B;
1068 bidi_it->type_after_wn = NEUTRAL_B;
1069 bidi_it->orig_type = NEUTRAL_B;
1070 /* FIXME: Review this!!! */
1071 bidi_it->prev.type = bidi_it->prev.orig_type = UNKNOWN_BT;
1072 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1073 bidi_it->next_for_neutral.charpos = -1;
1074 bidi_it->next_for_neutral.type
1075 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1076 bidi_it->prev_for_neutral.charpos = -1;
1077 bidi_it->prev_for_neutral.type
1078 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1079 bidi_it->sos = L2R; /* FIXME: should it be user-selectable? */
1080 bidi_it->disp_pos = -1; /* invalid/unknown */
1081 bidi_it->disp_prop = 0;
1082 /* We can only shrink the cache if we are at the bottom level of its
1083 "stack". */
1084 if (bidi_cache_start == 0)
1085 bidi_cache_shrink ();
1086 else
1087 bidi_cache_reset ();
1088 }
1089
1090 /* Perform initializations for reordering a new line of bidi text. */
1091 static void
1092 bidi_line_init (struct bidi_it *bidi_it)
1093 {
1094 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1095 bidi_it->stack_idx = 0;
1096 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1097 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
1098 bidi_it->level_stack[0].isolate_status = false; /* X1 */
1099 bidi_it->invalid_levels = 0;
1100 bidi_it->isolate_level = 0; /* X1 */
1101 bidi_it->invalid_isolates = 0; /* X1 */
1102 /* Setting this to zero will force its recomputation the first time
1103 we need it for W5. */
1104 bidi_it->next_en_pos = 0;
1105 bidi_it->next_en_type = UNKNOWN_BT;
1106 bidi_it->next_for_ws.charpos = -1;
1107 bidi_it->next_for_ws.type = UNKNOWN_BT;
1108 bidi_set_sos_type (bidi_it,
1109 (bidi_it->paragraph_dir == R2L ? 1 : 0),
1110 bidi_it->level_stack[0].level); /* X10 */
1111
1112 bidi_cache_reset ();
1113 }
1114
1115 \f
1116 /***********************************************************************
1117 Fetching characters
1118 ***********************************************************************/
1119
1120 /* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
1121 are zero-based character positions in S, BEGBYTE is byte position
1122 corresponding to BEG. UNIBYTE means S is a unibyte string. */
1123 static ptrdiff_t
1124 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1125 ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1126 {
1127 ptrdiff_t pos = beg;
1128 const unsigned char *p = s + begbyte, *start = p;
1129
1130 if (unibyte)
1131 p = s + end;
1132 else
1133 {
1134 if (!CHAR_HEAD_P (*p))
1135 emacs_abort ();
1136
1137 while (pos < end)
1138 {
1139 p += BYTES_BY_CHAR_HEAD (*p);
1140 pos++;
1141 }
1142 }
1143
1144 return p - start;
1145 }
1146
1147 /* Fetch and return the character at byte position BYTEPOS. If S is
1148 non-NULL, fetch the character from string S; otherwise fetch the
1149 character from the current buffer. UNIBYTE means S is a
1150 unibyte string. */
1151 static int
1152 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1153 {
1154 if (s)
1155 {
1156 s += bytepos;
1157 if (unibyte)
1158 return *s;
1159 }
1160 else
1161 s = BYTE_POS_ADDR (bytepos);
1162 return STRING_CHAR (s);
1163 }
1164
1165 /* Fetch and return the character at CHARPOS/BYTEPOS. If that
1166 character is covered by a display string, treat the entire run of
1167 covered characters as a single character, either u+2029 or u+FFFC,
1168 and return their combined length in CH_LEN and NCHARS. DISP_POS
1169 specifies the character position of the next display string, or -1
1170 if not yet computed. When the next character is at or beyond that
1171 position, the function updates DISP_POS with the position of the
1172 next display string. *DISP_PROP non-zero means that there's really
1173 a display string at DISP_POS, as opposed to when we searched till
1174 DISP_POS without finding one. If *DISP_PROP is 2, it means the
1175 display spec is of the form `(space ...)', which is replaced with
1176 u+2029 to handle it as a paragraph separator. STRING->s is the C
1177 string to iterate, or NULL if iterating over a buffer or a Lisp
1178 string; in the latter case, STRING->lstring is the Lisp string. */
1179 static int
1180 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1181 int *disp_prop, struct bidi_string_data *string,
1182 struct window *w,
1183 bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1184 {
1185 int ch;
1186 ptrdiff_t endpos
1187 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1188 struct text_pos pos;
1189 int len;
1190
1191 /* If we got past the last known position of display string, compute
1192 the position of the next one. That position could be at CHARPOS. */
1193 if (charpos < endpos && charpos > *disp_pos)
1194 {
1195 SET_TEXT_POS (pos, charpos, bytepos);
1196 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1197 disp_prop);
1198 }
1199
1200 /* Fetch the character at BYTEPOS. */
1201 if (charpos >= endpos)
1202 {
1203 ch = BIDI_EOB;
1204 *ch_len = 1;
1205 *nchars = 1;
1206 *disp_pos = endpos;
1207 *disp_prop = 0;
1208 }
1209 else if (charpos >= *disp_pos && *disp_prop)
1210 {
1211 ptrdiff_t disp_end_pos;
1212
1213 /* We don't expect to find ourselves in the middle of a display
1214 property. Hopefully, it will never be needed. */
1215 if (charpos > *disp_pos)
1216 emacs_abort ();
1217 /* Text covered by `display' properties and overlays with
1218 display properties or display strings is handled as a single
1219 character that represents the entire run of characters
1220 covered by the display property. */
1221 if (*disp_prop == 2)
1222 {
1223 /* `(space ...)' display specs are handled as paragraph
1224 separators for the purposes of the reordering; see UAX#9
1225 section 3 and clause HL1 in section 4.3 there. */
1226 ch = 0x2029;
1227 }
1228 else
1229 {
1230 /* All other display specs are handled as the Unicode Object
1231 Replacement Character. */
1232 ch = 0xFFFC;
1233 }
1234 disp_end_pos = compute_display_string_end (*disp_pos, string);
1235 if (disp_end_pos < 0)
1236 {
1237 /* Somebody removed the display string from the buffer
1238 behind our back. Recover by processing this buffer
1239 position as if no display property were present there to
1240 begin with. */
1241 *disp_prop = 0;
1242 goto normal_char;
1243 }
1244 *nchars = disp_end_pos - *disp_pos;
1245 if (*nchars <= 0)
1246 emacs_abort ();
1247 if (string->s)
1248 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1249 disp_end_pos, string->unibyte);
1250 else if (STRINGP (string->lstring))
1251 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1252 bytepos, disp_end_pos, string->unibyte);
1253 else
1254 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1255 }
1256 else
1257 {
1258 normal_char:
1259 if (string->s)
1260 {
1261
1262 if (!string->unibyte)
1263 {
1264 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1265 *ch_len = len;
1266 }
1267 else
1268 {
1269 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1270 *ch_len = 1;
1271 }
1272 }
1273 else if (STRINGP (string->lstring))
1274 {
1275 if (!string->unibyte)
1276 {
1277 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1278 len);
1279 *ch_len = len;
1280 }
1281 else
1282 {
1283 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1284 *ch_len = 1;
1285 }
1286 }
1287 else
1288 {
1289 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1290 *ch_len = len;
1291 }
1292 *nchars = 1;
1293 }
1294
1295 /* If we just entered a run of characters covered by a display
1296 string, compute the position of the next display string. */
1297 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1298 && *disp_prop)
1299 {
1300 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1301 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1302 disp_prop);
1303 }
1304
1305 return ch;
1306 }
1307
1308 /* Like bidi_fetch_char, but ignore any text between an isolate
1309 initiator and its matching PDI or, if it has no matching PDI, the
1310 end of the paragraph. If isolates were skipped, CH_LEN and NCHARS
1311 are set to the number of bytes and characters between BYTEPOS/CHARPOS
1312 and the character that was fetched after skipping the isolates. */
1313 static int
1314 bidi_fetch_char_skip_isolates (ptrdiff_t charpos, ptrdiff_t bytepos,
1315 ptrdiff_t *disp_pos, int *disp_prop,
1316 struct bidi_string_data *string,
1317 struct window *w, bool frame_window_p,
1318 ptrdiff_t *ch_len, ptrdiff_t *nchars)
1319 {
1320 ptrdiff_t orig_charpos = charpos, orig_bytepos = bytepos;
1321 int ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string, w,
1322 frame_window_p, ch_len, nchars);
1323 bidi_type_t ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1324 ptrdiff_t level = 0;
1325
1326 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1327 {
1328 level++;
1329 while (level > 0 && ch_type != NEUTRAL_B)
1330 {
1331 charpos += *nchars;
1332 bytepos += *ch_len;
1333 ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string,
1334 w, frame_window_p, ch_len, nchars);
1335 ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1336 /* A Note to P2 says to ignore max_depth limit. */
1337 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1338 level++;
1339 else if (ch_type == PDI)
1340 level--;
1341 }
1342 }
1343
1344 /* Communicate to the caller how much did we skip, so it could get
1345 past the last character position we examined. */
1346 *nchars += charpos - orig_charpos;
1347 *ch_len += bytepos - orig_bytepos;
1348 return ch;
1349 }
1350
1351
1352 \f
1353 /***********************************************************************
1354 Determining paragraph direction
1355 ***********************************************************************/
1356
1357 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1358 Value is the non-negative length of the paragraph separator
1359 following the buffer position, -1 if position is at the beginning
1360 of a new paragraph, or -2 if position is neither at beginning nor
1361 at end of a paragraph. */
1362 static ptrdiff_t
1363 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1364 {
1365 Lisp_Object sep_re;
1366 Lisp_Object start_re;
1367 ptrdiff_t val;
1368
1369 sep_re = paragraph_separate_re;
1370 start_re = paragraph_start_re;
1371
1372 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1373 if (val < 0)
1374 {
1375 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1376 val = -1;
1377 else
1378 val = -2;
1379 }
1380
1381 return val;
1382 }
1383
1384 /* If the user has requested the long scans caching, make sure that
1385 BIDI cache is enabled. Otherwise, make sure it's disabled. */
1386
1387 static struct region_cache *
1388 bidi_paragraph_cache_on_off (void)
1389 {
1390 struct buffer *cache_buffer = current_buffer;
1391 bool indirect_p = false;
1392
1393 /* For indirect buffers, make sure to use the cache of their base
1394 buffer. */
1395 if (cache_buffer->base_buffer)
1396 {
1397 cache_buffer = cache_buffer->base_buffer;
1398 indirect_p = true;
1399 }
1400
1401 /* Don't turn on or off the cache in the base buffer, if the value
1402 of cache-long-scans of the base buffer is inconsistent with that.
1403 This is because doing so will just make the cache pure overhead,
1404 since if we turn it on via indirect buffer, it will be
1405 immediately turned off by its base buffer. */
1406 if (NILP (BVAR (current_buffer, cache_long_scans)))
1407 {
1408 if (!indirect_p
1409 || NILP (BVAR (cache_buffer, cache_long_scans)))
1410 {
1411 if (cache_buffer->bidi_paragraph_cache)
1412 {
1413 free_region_cache (cache_buffer->bidi_paragraph_cache);
1414 cache_buffer->bidi_paragraph_cache = 0;
1415 }
1416 }
1417 return NULL;
1418 }
1419 else
1420 {
1421 if (!indirect_p
1422 || !NILP (BVAR (cache_buffer, cache_long_scans)))
1423 {
1424 if (!cache_buffer->bidi_paragraph_cache)
1425 cache_buffer->bidi_paragraph_cache = new_region_cache ();
1426 }
1427 return cache_buffer->bidi_paragraph_cache;
1428 }
1429 }
1430
1431 /* On my 2005-vintage machine, searching back for paragraph start
1432 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1433 when user types C-p. The number below limits each call to
1434 bidi_paragraph_init to about 10 ms. */
1435 #define MAX_PARAGRAPH_SEARCH 7500
1436
1437 /* Find the beginning of this paragraph by looking back in the buffer.
1438 Value is the byte position of the paragraph's beginning, or
1439 BEGV_BYTE if paragraph_start_re is still not found after looking
1440 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
1441 static ptrdiff_t
1442 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1443 {
1444 Lisp_Object re = paragraph_start_re;
1445 ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1446 struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1447 ptrdiff_t n = 0, oldpos = pos, next;
1448 struct buffer *cache_buffer = current_buffer;
1449
1450 if (cache_buffer->base_buffer)
1451 cache_buffer = cache_buffer->base_buffer;
1452
1453 while (pos_byte > BEGV_BYTE
1454 && n++ < MAX_PARAGRAPH_SEARCH
1455 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1456 {
1457 /* FIXME: What if the paragraph beginning is covered by a
1458 display string? And what if a display string covering some
1459 of the text over which we scan back includes
1460 paragraph_start_re? */
1461 DEC_BOTH (pos, pos_byte);
1462 if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1463 {
1464 pos = next, pos_byte = CHAR_TO_BYTE (pos);
1465 break;
1466 }
1467 else
1468 pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1469 }
1470 if (n >= MAX_PARAGRAPH_SEARCH)
1471 pos = BEGV, pos_byte = BEGV_BYTE;
1472 if (bpc)
1473 know_region_cache (cache_buffer, bpc, pos, oldpos);
1474 /* Positions returned by the region cache are not limited to
1475 BEGV..ZV range, so we limit them here. */
1476 pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1477 return pos_byte;
1478 }
1479
1480 /* On a 3.4 GHz machine, searching forward for a strong directional
1481 character in a long paragraph full of weaks or neutrals takes about
1482 1 ms for each 20K characters. The number below limits each call to
1483 bidi_paragraph_init to less than 10 ms even on slow machines. */
1484 #define MAX_STRONG_CHAR_SEARCH 100000
1485
1486 /* Starting from POS, find the first strong (L, R, or AL) character,
1487 while skipping over any characters between an isolate initiator and
1488 its matching PDI. STOP_AT_PDI non-zero means stop at the PDI that
1489 matches the isolate initiator at POS. Return the bidi type of the
1490 character where the search stopped. Give up if after examining
1491 MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1492 character was found. */
1493 static bidi_type_t
1494 find_first_strong_char (ptrdiff_t pos, ptrdiff_t bytepos, ptrdiff_t end,
1495 ptrdiff_t *disp_pos, int *disp_prop,
1496 struct bidi_string_data *string, struct window *w,
1497 bool string_p, bool frame_window_p,
1498 ptrdiff_t *ch_len, ptrdiff_t *nchars, bool stop_at_pdi)
1499 {
1500 ptrdiff_t pos1;
1501 bidi_type_t type;
1502 int ch;
1503
1504 if (stop_at_pdi)
1505 {
1506 /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1507 at POS. Get past it. */
1508 #ifdef ENABLE_CHECKING
1509 ch = bidi_fetch_char (pos, bytepos, disp_pos, disp_prop, string, w,
1510 frame_window_p, ch_len, nchars);
1511 type = bidi_get_type (ch, NEUTRAL_DIR);
1512 eassert (type == FSI /* || type == LRI || type == RLI */);
1513 #endif
1514 pos += *nchars;
1515 bytepos += *ch_len;
1516 }
1517 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop, string,
1518 w, frame_window_p, ch_len, nchars);
1519 type = bidi_get_type (ch, NEUTRAL_DIR);
1520
1521 pos1 = pos;
1522 for (pos += *nchars, bytepos += *ch_len;
1523 bidi_get_category (type) != STRONG
1524 /* If requested to stop at first PDI, stop there. */
1525 && !(stop_at_pdi && type == PDI)
1526 /* Stop when searched too far into an abnormally large
1527 paragraph full of weak or neutral characters. */
1528 && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1529 type = bidi_get_type (ch, NEUTRAL_DIR))
1530 {
1531 if (pos >= end)
1532 {
1533 /* Pretend there's a paragraph separator at end of
1534 buffer/string. */
1535 type = NEUTRAL_B;
1536 break;
1537 }
1538 if (!string_p
1539 && type == NEUTRAL_B
1540 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1541 break;
1542 /* Fetch next character and advance to get past it. */
1543 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop,
1544 string, w, frame_window_p,
1545 ch_len, nchars);
1546 pos += *nchars;
1547 bytepos += *ch_len;
1548 }
1549 return type;
1550 }
1551
1552 /* Determine the base direction, a.k.a. base embedding level, of the
1553 paragraph we are about to iterate through. If DIR is either L2R or
1554 R2L, just use that. Otherwise, determine the paragraph direction
1555 from the first strong directional character of the paragraph.
1556
1557 NO_DEFAULT_P means don't default to L2R if the paragraph
1558 has no strong directional characters and both DIR and
1559 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1560 in the buffer until a paragraph is found with a strong character,
1561 or until hitting BEGV. In the latter case, fall back to L2R. This
1562 flag is used in current-bidi-paragraph-direction.
1563
1564 Note that this function gives the paragraph separator the same
1565 direction as the preceding paragraph, even though Emacs generally
1566 views the separator as not belonging to any paragraph. */
1567 void
1568 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1569 {
1570 ptrdiff_t bytepos = bidi_it->bytepos;
1571 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1572 ptrdiff_t pstartbyte;
1573 /* Note that begbyte is a byte position, while end is a character
1574 position. Yes, this is ugly, but we are trying to avoid costly
1575 calls to BYTE_TO_CHAR and its ilk. */
1576 ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1577 ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1578
1579 /* Special case for an empty buffer. */
1580 if (bytepos == begbyte && bidi_it->charpos == end)
1581 dir = L2R;
1582 /* We should never be called at EOB or before BEGV. */
1583 else if (bidi_it->charpos >= end || bytepos < begbyte)
1584 emacs_abort ();
1585
1586 if (dir == L2R)
1587 {
1588 bidi_it->paragraph_dir = L2R;
1589 bidi_it->new_paragraph = 0;
1590 }
1591 else if (dir == R2L)
1592 {
1593 bidi_it->paragraph_dir = R2L;
1594 bidi_it->new_paragraph = 0;
1595 }
1596 else if (dir == NEUTRAL_DIR) /* P2 */
1597 {
1598 ptrdiff_t ch_len, nchars;
1599 ptrdiff_t pos, disp_pos = -1;
1600 int disp_prop = 0;
1601 bidi_type_t type;
1602 const unsigned char *s;
1603
1604 if (!bidi_initialized)
1605 bidi_initialize ();
1606
1607 /* If we are inside a paragraph separator, we are just waiting
1608 for the separator to be exhausted; use the previous paragraph
1609 direction. But don't do that if we have been just reseated,
1610 because we need to reinitialize below in that case. */
1611 if (!bidi_it->first_elt
1612 && bidi_it->charpos < bidi_it->separator_limit)
1613 return;
1614
1615 /* If we are on a newline, get past it to where the next
1616 paragraph might start. But don't do that at BEGV since then
1617 we are potentially in a new paragraph that doesn't yet
1618 exist. */
1619 pos = bidi_it->charpos;
1620 s = (STRINGP (bidi_it->string.lstring)
1621 ? SDATA (bidi_it->string.lstring)
1622 : bidi_it->string.s);
1623 if (bytepos > begbyte
1624 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1625 {
1626 bytepos++;
1627 pos++;
1628 }
1629
1630 /* We are either at the beginning of a paragraph or in the
1631 middle of it. Find where this paragraph starts. */
1632 if (string_p)
1633 {
1634 /* We don't support changes of paragraph direction inside a
1635 string. It is treated as a single paragraph. */
1636 pstartbyte = 0;
1637 }
1638 else
1639 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1640 bidi_it->separator_limit = -1;
1641 bidi_it->new_paragraph = 0;
1642
1643 /* The following loop is run more than once only if NO_DEFAULT_P,
1644 and only if we are iterating on a buffer. */
1645 do {
1646 bytepos = pstartbyte;
1647 if (!string_p)
1648 pos = BYTE_TO_CHAR (bytepos);
1649 type = find_first_strong_char (pos, bytepos, end, &disp_pos, &disp_prop,
1650 &bidi_it->string, bidi_it->w,
1651 string_p, bidi_it->frame_window_p,
1652 &ch_len, &nchars, false);
1653 if (type == STRONG_R || type == STRONG_AL) /* P3 */
1654 bidi_it->paragraph_dir = R2L;
1655 else if (type == STRONG_L)
1656 bidi_it->paragraph_dir = L2R;
1657 if (!string_p
1658 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1659 {
1660 /* If this paragraph is at BEGV, default to L2R. */
1661 if (pstartbyte == BEGV_BYTE)
1662 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1663 else
1664 {
1665 ptrdiff_t prevpbyte = pstartbyte;
1666 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1667
1668 /* Find the beginning of the previous paragraph, if any. */
1669 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1670 {
1671 /* FXIME: What if p is covered by a display
1672 string? See also a FIXME inside
1673 bidi_find_paragraph_start. */
1674 DEC_BOTH (p, pbyte);
1675 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1676 }
1677 pstartbyte = prevpbyte;
1678 }
1679 }
1680 } while (!string_p
1681 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1682 }
1683 else
1684 emacs_abort ();
1685
1686 /* Contrary to UAX#9 clause P3, we only default the paragraph
1687 direction to L2R if we have no previous usable paragraph
1688 direction. This is allowed by the HL1 clause. */
1689 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1690 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1691 if (bidi_it->paragraph_dir == R2L)
1692 bidi_it->level_stack[0].level = 1;
1693 else
1694 bidi_it->level_stack[0].level = 0;
1695
1696 bidi_line_init (bidi_it);
1697 }
1698
1699 \f
1700 /***********************************************************************
1701 Resolving explicit and implicit levels.
1702 The rest of this file constitutes the core of the UBA implementation.
1703 ***********************************************************************/
1704
1705 static bool
1706 bidi_explicit_dir_char (int ch)
1707 {
1708 bidi_type_t ch_type;
1709
1710 if (!bidi_initialized)
1711 emacs_abort ();
1712 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1713 return (ch_type == LRE || ch_type == LRO
1714 || ch_type == RLE || ch_type == RLO
1715 || ch_type == PDF);
1716 }
1717
1718 /* Given an iterator state in BIDI_IT, advance one character position
1719 in the buffer/string to the next character (in the logical order),
1720 resolve any explicit embeddings, directional overrides, and isolate
1721 initiators and terminators, and return the embedding level of the
1722 character after resolving these explicit directives. */
1723 static int
1724 bidi_resolve_explicit (struct bidi_it *bidi_it)
1725 {
1726 int curchar;
1727 bidi_type_t type, typ1, prev_type = UNKNOWN_BT;
1728 int current_level;
1729 int new_level;
1730 bidi_dir_t override;
1731 bool isolate_status;
1732 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1733 ptrdiff_t ch_len, nchars, disp_pos, end;
1734 int disp_prop;
1735
1736 /* Record the info about the previous character. */
1737 if (bidi_it->type_after_wn != WEAK_BN /* W1/Retaining */
1738 && bidi_it->type != WEAK_BN)
1739 {
1740 /* This special case is needed in support of Unicode 8.0
1741 correction to N0, as implemented in bidi_resolve_weak/W1
1742 below. */
1743 if (bidi_it->type_after_wn == NEUTRAL_ON
1744 && bidi_get_category (bidi_it->type) == STRONG
1745 && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
1746 bidi_remember_char (&bidi_it->prev, bidi_it, 1);
1747 else
1748 bidi_remember_char (&bidi_it->prev, bidi_it, 0);
1749 }
1750 if (bidi_it->type_after_wn == STRONG_R
1751 || bidi_it->type_after_wn == STRONG_L
1752 || bidi_it->type_after_wn == STRONG_AL)
1753 bidi_remember_char (&bidi_it->last_strong, bidi_it, 0);
1754 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1755 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1756 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it, 1);
1757
1758 /* If we overstepped the characters used for resolving neutrals
1759 and whitespace, invalidate their info in the iterator. */
1760 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1761 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1762 if (bidi_it->next_en_pos >= 0
1763 && bidi_it->charpos >= bidi_it->next_en_pos)
1764 {
1765 bidi_it->next_en_pos = 0;
1766 bidi_it->next_en_type = UNKNOWN_BT;
1767 }
1768
1769 /* Reset the bracket resolution info. */
1770 bidi_it->bracket_pairing_pos = -1;
1771 bidi_it->bracket_enclosed_type = UNKNOWN_BT;
1772
1773 /* If reseat()'ed, don't advance, so as to start iteration from the
1774 position where we were reseated. bidi_it->bytepos can be less
1775 than BEGV_BYTE after reseat to BEGV. */
1776 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1777 || bidi_it->first_elt)
1778 {
1779 bidi_it->first_elt = 0;
1780 if (string_p)
1781 {
1782 const unsigned char *p
1783 = (STRINGP (bidi_it->string.lstring)
1784 ? SDATA (bidi_it->string.lstring)
1785 : bidi_it->string.s);
1786
1787 if (bidi_it->charpos < 0)
1788 bidi_it->charpos = bidi_it->bytepos = 0;
1789 eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1790 bidi_it->charpos,
1791 bidi_it->string.unibyte));
1792 }
1793 else
1794 {
1795 if (bidi_it->charpos < BEGV)
1796 {
1797 bidi_it->charpos = BEGV;
1798 bidi_it->bytepos = BEGV_BYTE;
1799 }
1800 eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1801 }
1802 /* Determine the original bidi type of the previous character,
1803 which is needed for handling isolate initiators and PDF. The
1804 type of the previous character will be non-trivial only if
1805 our caller moved through some previous text in
1806 get_visually_first_element, in which case bidi_it->prev holds
1807 the information we want. */
1808 if (bidi_it->first_elt && bidi_it->prev.type != UNKNOWN_BT)
1809 {
1810 eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
1811 prev_type = bidi_it->prev.orig_type;
1812 if (prev_type == FSI)
1813 prev_type = bidi_it->type_after_wn;
1814 }
1815 }
1816 /* Don't move at end of buffer/string. */
1817 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1818 {
1819 /* Advance to the next character, skipping characters covered by
1820 display strings (nchars > 1). */
1821 if (bidi_it->nchars <= 0)
1822 emacs_abort ();
1823 bidi_it->charpos += bidi_it->nchars;
1824 if (bidi_it->ch_len == 0)
1825 emacs_abort ();
1826 bidi_it->bytepos += bidi_it->ch_len;
1827 prev_type = bidi_it->orig_type;
1828 if (prev_type == FSI)
1829 prev_type = bidi_it->type_after_wn;
1830 }
1831 else /* EOB or end of string */
1832 prev_type = NEUTRAL_B;
1833
1834 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1835 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1836 isolate_status = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
1837 new_level = current_level;
1838
1839 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1840 {
1841 curchar = BIDI_EOB;
1842 bidi_it->ch_len = 1;
1843 bidi_it->nchars = 1;
1844 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1845 bidi_it->disp_prop = 0;
1846 }
1847 else
1848 {
1849 /* LRI, RLI, and FSI increment, and PDF decrements, the
1850 embedding level of the _following_ characters, so we must
1851 first look at the type of the previous character to support
1852 that. */
1853 switch (prev_type)
1854 {
1855 case RLI: /* X5a */
1856 if (current_level < BIDI_MAXDEPTH
1857 && bidi_it->invalid_levels == 0
1858 && bidi_it->invalid_isolates == 0)
1859 {
1860 new_level = ((current_level + 1) & ~1) + 1;
1861 bidi_it->isolate_level++;
1862 bidi_push_embedding_level (bidi_it, new_level,
1863 NEUTRAL_DIR, true);
1864 }
1865 else
1866 bidi_it->invalid_isolates++;
1867 break;
1868 case LRI: /* X5b */
1869 if (current_level < BIDI_MAXDEPTH - 1
1870 && bidi_it->invalid_levels == 0
1871 && bidi_it->invalid_isolates == 0)
1872 {
1873 new_level = ((current_level + 2) & ~1);
1874 bidi_it->isolate_level++;
1875 bidi_push_embedding_level (bidi_it, new_level,
1876 NEUTRAL_DIR, true);
1877 }
1878 else
1879 bidi_it->invalid_isolates++;
1880 break;
1881 case PDF: /* X7 */
1882 if (!bidi_it->invalid_isolates)
1883 {
1884 if (bidi_it->invalid_levels)
1885 bidi_it->invalid_levels--;
1886 else if (!isolate_status && bidi_it->stack_idx >= 1)
1887 new_level = bidi_pop_embedding_level (bidi_it);
1888 }
1889 break;
1890 default:
1891 eassert (prev_type != FSI);
1892 /* Nothing. */
1893 break;
1894 }
1895 /* Fetch the character at BYTEPOS. If it is covered by a
1896 display string, treat the entire run of covered characters as
1897 a single character u+FFFC. */
1898 curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
1899 &bidi_it->disp_pos, &bidi_it->disp_prop,
1900 &bidi_it->string, bidi_it->w,
1901 bidi_it->frame_window_p,
1902 &bidi_it->ch_len, &bidi_it->nchars);
1903 }
1904 bidi_it->ch = curchar;
1905 bidi_it->resolved_level = new_level;
1906
1907 /* Don't apply directional override here, as all the types we handle
1908 below will not be affected by the override anyway, and we need
1909 the original type unaltered. The override will be applied in
1910 bidi_resolve_weak. */
1911 type = bidi_get_type (curchar, NEUTRAL_DIR);
1912 bidi_it->orig_type = type;
1913 bidi_check_type (bidi_it->orig_type);
1914
1915 bidi_it->type_after_wn = UNKNOWN_BT;
1916
1917 switch (type)
1918 {
1919 case RLE: /* X2 */
1920 case RLO: /* X4 */
1921 bidi_it->type_after_wn = type;
1922 bidi_check_type (bidi_it->type_after_wn);
1923 type = WEAK_BN; /* X9/Retaining */
1924 if (new_level < BIDI_MAXDEPTH
1925 && bidi_it->invalid_levels == 0
1926 && bidi_it->invalid_isolates == 0)
1927 {
1928 /* Compute the least odd embedding level greater than
1929 the current level. */
1930 new_level = ((new_level + 1) & ~1) + 1;
1931 if (bidi_it->type_after_wn == RLE)
1932 override = NEUTRAL_DIR;
1933 else
1934 override = R2L;
1935 bidi_push_embedding_level (bidi_it, new_level, override, false);
1936 bidi_it->resolved_level = new_level;
1937 }
1938 else
1939 {
1940 if (bidi_it->invalid_isolates == 0)
1941 bidi_it->invalid_levels++;
1942 }
1943 break;
1944 case LRE: /* X3 */
1945 case LRO: /* X5 */
1946 bidi_it->type_after_wn = type;
1947 bidi_check_type (bidi_it->type_after_wn);
1948 type = WEAK_BN; /* X9/Retaining */
1949 if (new_level < BIDI_MAXDEPTH - 1
1950 && bidi_it->invalid_levels == 0
1951 && bidi_it->invalid_isolates == 0)
1952 {
1953 /* Compute the least even embedding level greater than
1954 the current level. */
1955 new_level = ((new_level + 2) & ~1);
1956 if (bidi_it->type_after_wn == LRE)
1957 override = NEUTRAL_DIR;
1958 else
1959 override = L2R;
1960 bidi_push_embedding_level (bidi_it, new_level, override, false);
1961 bidi_it->resolved_level = new_level;
1962 }
1963 else
1964 {
1965 if (bidi_it->invalid_isolates == 0)
1966 bidi_it->invalid_levels++;
1967 }
1968 break;
1969 case FSI: /* X5c */
1970 end = string_p ? bidi_it->string.schars : ZV;
1971 disp_pos = bidi_it->disp_pos;
1972 disp_prop = bidi_it->disp_prop;
1973 nchars = bidi_it->nchars;
1974 ch_len = bidi_it->ch_len;
1975 typ1 = find_first_strong_char (bidi_it->charpos,
1976 bidi_it->bytepos, end,
1977 &disp_pos, &disp_prop,
1978 &bidi_it->string, bidi_it->w,
1979 string_p, bidi_it->frame_window_p,
1980 &ch_len, &nchars, true);
1981 if (typ1 != STRONG_R && typ1 != STRONG_AL)
1982 {
1983 type = LRI;
1984 goto fsi_as_lri;
1985 }
1986 else
1987 type = RLI;
1988 /* FALLTHROUGH */
1989 case RLI: /* X5a */
1990 if (override == NEUTRAL_DIR)
1991 bidi_it->type_after_wn = type;
1992 else /* Unicode 8.0 correction. */
1993 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
1994 bidi_check_type (bidi_it->type_after_wn);
1995 break;
1996 case LRI: /* X5b */
1997 fsi_as_lri:
1998 if (override == NEUTRAL_DIR)
1999 bidi_it->type_after_wn = type;
2000 else /* Unicode 8.0 correction. */
2001 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2002 bidi_check_type (bidi_it->type_after_wn);
2003 break;
2004 case PDI: /* X6a */
2005 if (bidi_it->invalid_isolates)
2006 bidi_it->invalid_isolates--;
2007 else if (bidi_it->isolate_level > 0)
2008 {
2009 bidi_it->invalid_levels = 0;
2010 while (!bidi_it->level_stack[bidi_it->stack_idx].isolate_status)
2011 bidi_pop_embedding_level (bidi_it);
2012 eassert (bidi_it->stack_idx > 0);
2013 new_level = bidi_pop_embedding_level (bidi_it);
2014 bidi_it->isolate_level--;
2015 }
2016 bidi_it->resolved_level = new_level;
2017 /* Unicode 8.0 correction. */
2018 if (bidi_it->level_stack[bidi_it->stack_idx].override == L2R)
2019 bidi_it->type_after_wn = STRONG_L;
2020 else if (bidi_it->level_stack[bidi_it->stack_idx].override == R2L)
2021 bidi_it->type_after_wn = STRONG_R;
2022 else
2023 bidi_it->type_after_wn = type;
2024 break;
2025 case PDF: /* X7 */
2026 bidi_it->type_after_wn = type;
2027 bidi_check_type (bidi_it->type_after_wn);
2028 type = WEAK_BN; /* X9/Retaining */
2029 break;
2030 default:
2031 /* Nothing. */
2032 break;
2033 }
2034
2035 bidi_it->type = type;
2036 bidi_check_type (bidi_it->type);
2037
2038 if (bidi_it->type == NEUTRAL_B) /* X8 */
2039 {
2040 bidi_set_paragraph_end (bidi_it);
2041 /* This is needed by bidi_resolve_weak below, and in L1. */
2042 bidi_it->type_after_wn = bidi_it->type;
2043 }
2044
2045 eassert (bidi_it->resolved_level >= 0);
2046 return bidi_it->resolved_level;
2047 }
2048
2049 /* Advance in the buffer/string, resolve weak types and return the
2050 type of the next character after weak type resolution. */
2051 static bidi_type_t
2052 bidi_resolve_weak (struct bidi_it *bidi_it)
2053 {
2054 bidi_type_t type;
2055 bidi_dir_t override;
2056 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2057 int new_level = bidi_resolve_explicit (bidi_it);
2058 int next_char;
2059 bidi_type_t type_of_next;
2060 struct bidi_it saved_it;
2061 ptrdiff_t eob
2062 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
2063 ? bidi_it->string.schars : ZV);
2064
2065 type = bidi_it->type;
2066 override = bidi_it->level_stack[bidi_it->stack_idx].override;
2067
2068 eassert (!(type == UNKNOWN_BT
2069 || type == LRE
2070 || type == LRO
2071 || type == RLE
2072 || type == RLO
2073 || type == PDF));
2074
2075 eassert (prev_level >= 0);
2076 if (bidi_it->type == NEUTRAL_B)
2077 {
2078 /* We've got a new isolating sequence, compute the directional
2079 type of sos and initialize per-run variables (UAX#9, clause
2080 X10). */
2081 bidi_set_sos_type (bidi_it, prev_level, new_level);
2082 }
2083 if (type == NEUTRAL_S || type == NEUTRAL_WS
2084 || type == WEAK_BN || type == STRONG_AL)
2085 bidi_it->type_after_wn = type; /* needed in L1 */
2086 bidi_check_type (bidi_it->type_after_wn);
2087
2088 /* Level and directional override status are already recorded in
2089 bidi_it, and do not need any change; see X6. */
2090 if (override == R2L) /* X6 */
2091 type = STRONG_R;
2092 else if (override == L2R)
2093 type = STRONG_L;
2094 else
2095 {
2096 if (type == WEAK_NSM) /* W1 */
2097 {
2098 /* Note that we don't need to consider the case where the
2099 prev character has its type overridden by an RLO or LRO,
2100 because then either the type of this NSM would have been
2101 also overridden, or the previous character is outside the
2102 current level run, and thus not relevant to this NSM.
2103 This is why NSM gets the type_after_wn of the previous
2104 character. */
2105 /* bidi_set_sos_type sets type_after_wn to UNKNOWN_BT. */
2106 if (bidi_it->prev.type != UNKNOWN_BT
2107 /* If type_after_wn is NEUTRAL_B, this NSM is at sos. */
2108 && bidi_it->prev.type != NEUTRAL_B)
2109 {
2110 if (bidi_isolate_fmt_char (bidi_it->prev.type))
2111 {
2112 /* From W1: "Note that in an isolating run sequence,
2113 an isolate initiator followed by an NSM or any
2114 type other than PDI must be an overflow isolate
2115 initiator." */
2116 eassert (bidi_it->invalid_isolates > 0);
2117 type = NEUTRAL_ON;
2118 }
2119 else
2120 {
2121 /* This includes the Unicode 8.0 correction for N0,
2122 due to how we set prev.type in bidi_resolve_explicit,
2123 which see. */
2124 type = bidi_it->prev.type;
2125 }
2126 }
2127 else if (bidi_it->sos == R2L)
2128 type = STRONG_R;
2129 else if (bidi_it->sos == L2R)
2130 type = STRONG_L;
2131 else /* shouldn't happen! */
2132 emacs_abort ();
2133 }
2134 if (type == WEAK_EN /* W2 */
2135 && bidi_it->last_strong.type == STRONG_AL)
2136 type = WEAK_AN;
2137 else if (type == STRONG_AL) /* W3 */
2138 type = STRONG_R;
2139 else if ((type == WEAK_ES /* W4 */
2140 && bidi_it->prev.type == WEAK_EN
2141 && bidi_it->prev.orig_type == WEAK_EN)
2142 || (type == WEAK_CS
2143 && ((bidi_it->prev.type == WEAK_EN
2144 && bidi_it->prev.orig_type == WEAK_EN)
2145 || bidi_it->prev.type == WEAK_AN)))
2146 {
2147 const unsigned char *s
2148 = (STRINGP (bidi_it->string.lstring)
2149 ? SDATA (bidi_it->string.lstring)
2150 : bidi_it->string.s);
2151
2152 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
2153 ? BIDI_EOB
2154 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
2155 s, bidi_it->string.unibyte));
2156 type_of_next = bidi_get_type (next_char, override);
2157
2158 if (type_of_next == WEAK_BN
2159 || bidi_explicit_dir_char (next_char))
2160 {
2161 bidi_copy_it (&saved_it, bidi_it);
2162 while (bidi_resolve_explicit (bidi_it) == new_level
2163 && bidi_it->type == WEAK_BN)
2164 type_of_next = bidi_it->type;
2165 bidi_copy_it (bidi_it, &saved_it);
2166 }
2167
2168 /* If the next character is EN, but the last strong-type
2169 character is AL, that next EN will be changed to AN when
2170 we process it in W2 above. So in that case, this ES
2171 should not be changed into EN. */
2172 if (type == WEAK_ES
2173 && type_of_next == WEAK_EN
2174 && bidi_it->last_strong.type != STRONG_AL)
2175 type = WEAK_EN;
2176 else if (type == WEAK_CS)
2177 {
2178 if (bidi_it->prev.type == WEAK_AN
2179 && (type_of_next == WEAK_AN
2180 /* If the next character is EN, but the last
2181 strong-type character is AL, EN will be later
2182 changed to AN when we process it in W2 above.
2183 So in that case, this ES should not be
2184 changed into EN. */
2185 || (type_of_next == WEAK_EN
2186 && bidi_it->last_strong.type == STRONG_AL)))
2187 type = WEAK_AN;
2188 else if (bidi_it->prev.type == WEAK_EN
2189 && type_of_next == WEAK_EN
2190 && bidi_it->last_strong.type != STRONG_AL)
2191 type = WEAK_EN;
2192 }
2193 }
2194 else if (type == WEAK_ET /* W5: ET with EN before or after it */
2195 || type == WEAK_BN) /* W5/Retaining */
2196 {
2197 if (bidi_it->prev.type == WEAK_EN) /* ET/BN w/EN before it */
2198 type = WEAK_EN;
2199 else if (bidi_it->next_en_pos > bidi_it->charpos
2200 && bidi_it->next_en_type != WEAK_BN)
2201 {
2202 if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2203 type = WEAK_EN;
2204 }
2205 else if (bidi_it->next_en_pos >=0)
2206 {
2207 /* We overstepped the last known position for ET
2208 resolution but there could be other such characters
2209 in this paragraph (when we are sure there are no more
2210 such positions, we set next_en_pos to a negative
2211 value). Try to find the next position for ET
2212 resolution. */
2213 ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2214 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2215 ? SDATA (bidi_it->string.lstring)
2216 : bidi_it->string.s);
2217
2218 if (bidi_it->nchars <= 0)
2219 emacs_abort ();
2220 next_char
2221 = (bidi_it->charpos + bidi_it->nchars >= eob
2222 ? BIDI_EOB
2223 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2224 bidi_it->string.unibyte));
2225 type_of_next = bidi_get_type (next_char, override);
2226
2227 if (type_of_next == WEAK_ET
2228 || type_of_next == WEAK_BN
2229 || bidi_explicit_dir_char (next_char))
2230 {
2231 bidi_copy_it (&saved_it, bidi_it);
2232 while (bidi_resolve_explicit (bidi_it) == new_level
2233 && (bidi_it->type == WEAK_BN
2234 || bidi_it->type == WEAK_ET))
2235 type_of_next = bidi_it->type;
2236 if (type == WEAK_BN
2237 && bidi_it->charpos == saved_it.charpos + saved_it.nchars)
2238 {
2239 /* If we entered the above loop with a BN that
2240 changes the level, the type of next
2241 character, which is in a different level, is
2242 not relevant to resolving this series of ET
2243 and BN. */
2244 en_pos = saved_it.charpos;
2245 type_of_next = type;
2246 }
2247 else
2248 en_pos = bidi_it->charpos;
2249 bidi_copy_it (bidi_it, &saved_it);
2250 }
2251 /* Remember this position, to speed up processing of the
2252 next ETs. */
2253 bidi_it->next_en_pos = en_pos;
2254 if (type_of_next == WEAK_EN)
2255 {
2256 /* If the last strong character is AL, the EN we've
2257 found will become AN when we get to it (W2). */
2258 if (bidi_it->last_strong.type == STRONG_AL)
2259 type_of_next = WEAK_AN;
2260 else if (type == WEAK_BN)
2261 type = NEUTRAL_ON; /* W6/Retaining */
2262 else
2263 type = WEAK_EN;
2264 }
2265 else if (type_of_next == NEUTRAL_B)
2266 /* Record the fact that there are no more ENs from
2267 here to the end of paragraph, to avoid entering the
2268 loop above ever again in this paragraph. */
2269 bidi_it->next_en_pos = -1;
2270 /* Record the type of the character where we ended our search. */
2271 bidi_it->next_en_type = type_of_next;
2272 }
2273 }
2274 }
2275
2276 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2277 || (type == WEAK_BN
2278 && (bidi_it->prev.type == WEAK_CS /* W6/Retaining */
2279 || bidi_it->prev.type == WEAK_ES
2280 || bidi_it->prev.type == WEAK_ET)))
2281 type = NEUTRAL_ON;
2282
2283 /* Store the type we've got so far, before we clobber it with strong
2284 types in W7 and while resolving neutral types. But leave alone
2285 the original types that were recorded above, because we will need
2286 them for the L1 clause. */
2287 if (bidi_it->type_after_wn == UNKNOWN_BT)
2288 bidi_it->type_after_wn = type;
2289 bidi_check_type (bidi_it->type_after_wn);
2290
2291 if (type == WEAK_EN) /* W7 */
2292 {
2293 if ((bidi_it->last_strong.type == STRONG_L)
2294 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sos == L2R))
2295 type = STRONG_L;
2296 }
2297
2298 bidi_it->type = type;
2299 bidi_check_type (bidi_it->type);
2300 return type;
2301 }
2302
2303 /* Resolve the type of a neutral character according to the type of
2304 surrounding strong text and the current embedding level. */
2305 static bidi_type_t
2306 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2307 {
2308 /* N1: "European and Arabic numbers act as if they were R in terms
2309 of their influence on NIs." */
2310 if (next_type == WEAK_EN || next_type == WEAK_AN)
2311 next_type = STRONG_R;
2312 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2313 prev_type = STRONG_R;
2314
2315 if (next_type == prev_type) /* N1 */
2316 return next_type;
2317 else if ((lev & 1) == 0) /* N2 */
2318 return STRONG_L;
2319 else
2320 return STRONG_R;
2321 }
2322
2323 #define FLAG_EMBEDDING_INSIDE 1
2324 #define FLAG_OPPOSITE_INSIDE 2
2325
2326 /* A data type used in the stack maintained by
2327 bidi_find_bracket_pairs below. */
2328 typedef struct bpa_stack_entry {
2329 int close_bracket_char;
2330 int open_bracket_idx;
2331 #ifdef ENABLE_CHECKING
2332 ptrdiff_t open_bracket_pos;
2333 #endif
2334 unsigned flags : 2;
2335 } bpa_stack_entry;
2336
2337 /* With MAX_ALLOCA of 16KB, this should allow at least 1K slots in the
2338 BPA stack, which should be more than enough for actual bidi text. */
2339 #define MAX_BPA_STACK (max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
2340
2341 /* UAX#9 says to match opening brackets with the matching closing
2342 brackets or their canonical equivalents. As of Unicode 7.0, there
2343 are only 2 bracket characters that have canonical equivalence
2344 decompositions: u+2329 and u+232A. So instead of accessing the
2345 table in uni-decomposition.el, we just handle these 2 characters
2346 with this simple macro. Note that ASCII characters don't have
2347 canonical equivalents by definition. */
2348
2349 /* To find all the characters that need to be processed by
2350 CANONICAL_EQU, first find all the characters which have
2351 decompositions in UnicodeData.txt, with this Awk script:
2352
2353 awk -F ";" " {if ($6 != \"\") print $1, $6}" UnicodeData.txt
2354
2355 Then produce a list of all the bracket characters in BidiBrackets.txt:
2356
2357 awk -F "[ ;]" " {if ($1 != \"#\" && $1 != \"\") print $1}" BidiBrackets.txt
2358
2359 And finally, cross-reference these two:
2360
2361 fgrep -w -f brackets.txt decompositions.txt
2362
2363 where "decompositions.txt" was produced by the 1st script, and
2364 "brackets.txt" by the 2nd script. In the output of fgrep, look
2365 only for decompositions that don't begin with some compatibility
2366 formatting tag, such as "<compat>". Only decompositions that
2367 consist solely of character codepoints are relevant to bidi
2368 brackets processing. */
2369
2370 #define CANONICAL_EQU(c) \
2371 ( ASCII_CHAR_P (c) ? c \
2372 : (c) == 0x2329 ? 0x3008 \
2373 : (c) == 0x232a ? 0x3009 \
2374 : c )
2375
2376 #ifdef ENABLE_CHECKING
2377 # define STORE_BRACKET_CHARPOS \
2378 bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2379 #else
2380 # define STORE_BRACKET_CHARPOS /* nothing */
2381 #endif
2382
2383 #define PUSH_BPA_STACK \
2384 do { \
2385 int ch; \
2386 bpa_sp++; \
2387 if (bpa_sp >= MAX_BPA_STACK) \
2388 { \
2389 bpa_sp = MAX_BPA_STACK - 1; \
2390 goto bpa_give_up; \
2391 } \
2392 ch = CANONICAL_EQU (bidi_it->ch); \
2393 bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (ch); \
2394 bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx; \
2395 bpa_stack[bpa_sp].flags = 0; \
2396 STORE_BRACKET_CHARPOS; \
2397 } while (0)
2398
2399
2400 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2401 described in BD16 and N0 of UAX#9. It finds all the bracket pairs
2402 in the current isolating sequence, and records the enclosed type
2403 and the position of the matching bracket in the cache. It returns
2404 non-zero if called with the iterator on the opening bracket which
2405 has a matching closing bracket in the current isolating sequence,
2406 zero otherwise. */
2407 static bool
2408 bidi_find_bracket_pairs (struct bidi_it *bidi_it)
2409 {
2410 bidi_bracket_type_t btype;
2411 bidi_type_t type = bidi_it->type;
2412 bool retval = false;
2413
2414 /* When scanning backwards, we don't expect any unresolved bidi
2415 bracket characters. */
2416 if (bidi_it->scan_dir != 1)
2417 emacs_abort ();
2418
2419 btype = bidi_paired_bracket_type (bidi_it->ch);
2420 if (btype == BIDI_BRACKET_OPEN)
2421 {
2422 bpa_stack_entry bpa_stack[MAX_BPA_STACK];
2423 int bpa_sp = -1;
2424 struct bidi_it saved_it;
2425 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2426 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2427 struct bidi_it tem_it;
2428
2429 eassert (MAX_BPA_STACK >= 100);
2430 bidi_copy_it (&saved_it, bidi_it);
2431 /* bidi_cache_iterator_state refuses to cache on backward scans,
2432 and bidi_cache_fetch_state doesn't bring scan_dir from the
2433 cache, so we must initialize this explicitly. */
2434 tem_it.scan_dir = 1;
2435
2436 while (1)
2437 {
2438 int old_sidx, new_sidx;
2439 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2440
2441 /* Mark every opening bracket character we've traversed by
2442 putting its own position into bracket_pairing_pos. This
2443 is examined in bidi_resolve_brackets to distinguish
2444 brackets that were already resolved to stay NEUTRAL_ON,
2445 and those that were not yet processed by this function
2446 (because they were skipped when we skip higher embedding
2447 levels below). */
2448 if (btype == BIDI_BRACKET_OPEN && bidi_it->bracket_pairing_pos == -1)
2449 bidi_it->bracket_pairing_pos = bidi_it->charpos;
2450 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2451 if (btype == BIDI_BRACKET_OPEN)
2452 PUSH_BPA_STACK;
2453 else if (btype == BIDI_BRACKET_CLOSE)
2454 {
2455 int sp = bpa_sp;
2456 int curchar = CANONICAL_EQU (bidi_it->ch);
2457
2458 eassert (sp >= 0);
2459 while (sp >= 0 && bpa_stack[sp].close_bracket_char != curchar)
2460 sp--;
2461 if (sp >= 0)
2462 {
2463 /* Update and cache the corresponding opening bracket. */
2464 bidi_cache_fetch_state (bpa_stack[sp].open_bracket_idx,
2465 &tem_it);
2466 #ifdef ENABLE_CHECKING
2467 eassert (bpa_stack[sp].open_bracket_pos == tem_it.charpos);
2468 #endif
2469 /* Determine the enclosed type for this bracket
2470 pair's type resolution according to N0. */
2471 if (bpa_stack[sp].flags & FLAG_EMBEDDING_INSIDE)
2472 tem_it.bracket_enclosed_type = embedding_type; /* N0b */
2473 else if (bpa_stack[sp].flags & FLAG_OPPOSITE_INSIDE)
2474 tem_it.bracket_enclosed_type /* N0c */
2475 = (embedding_type == STRONG_L ? STRONG_R : STRONG_L);
2476 else /* N0d */
2477 tem_it.bracket_enclosed_type = UNKNOWN_BT;
2478
2479 /* Record the position of the matching closing
2480 bracket, and update the cache. */
2481 tem_it.bracket_pairing_pos = bidi_it->charpos;
2482 bidi_cache_iterator_state (&tem_it, 0, 1);
2483
2484 /* Pop the BPA stack. */
2485 bpa_sp = sp - 1;
2486 }
2487 if (bpa_sp < 0)
2488 {
2489 retval = true;
2490 break;
2491 }
2492 }
2493 else if (bidi_get_category (bidi_it->type_after_wn) != NEUTRAL)
2494 {
2495 unsigned flag = 0;
2496 int sp;
2497
2498 /* Whenever we see a strong type, update the flags of
2499 all the slots on the stack. */
2500 switch (bidi_it->type)
2501 {
2502 case STRONG_L:
2503 flag = ((embedding_level & 1) == 0
2504 ? FLAG_EMBEDDING_INSIDE
2505 : FLAG_OPPOSITE_INSIDE);
2506 break;
2507 case STRONG_R:
2508 case WEAK_EN:
2509 case WEAK_AN:
2510 flag = ((embedding_level & 1) == 1
2511 ? FLAG_EMBEDDING_INSIDE
2512 : FLAG_OPPOSITE_INSIDE);
2513 break;
2514 default:
2515 break;
2516 }
2517 if (flag)
2518 {
2519 for (sp = bpa_sp; sp >= 0; sp--)
2520 bpa_stack[sp].flags |= flag;
2521 }
2522 }
2523 old_sidx = bidi_it->stack_idx;
2524 type = bidi_resolve_weak (bidi_it);
2525 /* Skip level runs excluded from this isolating run sequence. */
2526 new_sidx = bidi_it->stack_idx;
2527 if (bidi_it->level_stack[new_sidx].level > current_level
2528 && (bidi_it->level_stack[new_sidx].isolate_status
2529 || (new_sidx > old_sidx + 1
2530 && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2531 {
2532 while (bidi_it->level_stack[bidi_it->stack_idx].level
2533 > current_level)
2534 {
2535 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2536 type = bidi_resolve_weak (bidi_it);
2537 }
2538 }
2539 if (type == NEUTRAL_B
2540 || (bidi_it->level_stack[bidi_it->stack_idx].level
2541 != current_level))
2542 {
2543 bpa_give_up:
2544 /* We've marched all the way to the end of this
2545 isolating run sequence, and didn't find matching
2546 closing brackets for some opening brackets. Leave
2547 their type unchanged. */
2548 break;
2549 }
2550 if (bidi_it->type_after_wn == NEUTRAL_ON) /* Unicode 8.0 correction */
2551 btype = bidi_paired_bracket_type (bidi_it->ch);
2552 else
2553 btype = BIDI_BRACKET_NONE;
2554 }
2555
2556 /* Restore bidi_it from the cache, which should have the bracket
2557 resolution members set as determined by the above loop. */
2558 type = bidi_cache_find (saved_it.charpos, 0, bidi_it);
2559 eassert (type == NEUTRAL_ON);
2560 }
2561
2562 return retval;
2563 }
2564
2565 static void
2566 bidi_record_type_for_neutral (struct bidi_saved_info *info, int level,
2567 bool nextp)
2568 {
2569 int idx;
2570
2571 for (idx = bidi_cache_last_idx + 1; idx < bidi_cache_idx; idx++)
2572 {
2573 int lev = bidi_cache[idx].level_stack[bidi_cache[idx].stack_idx].level;
2574
2575 if (lev <= level)
2576 {
2577 eassert (lev == level);
2578 if (nextp)
2579 bidi_cache[idx].next_for_neutral = *info;
2580 else
2581 bidi_cache[idx].prev_for_neutral = *info;
2582 break;
2583 }
2584 }
2585 }
2586
2587 static bidi_type_t
2588 bidi_resolve_brackets (struct bidi_it *bidi_it)
2589 {
2590 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2591 bool resolve_bracket = false;
2592 bidi_type_t type = UNKNOWN_BT;
2593 int ch;
2594 struct bidi_saved_info prev_for_neutral, next_for_neutral;
2595
2596 /* Record the prev_for_neutral type either from the previous
2597 character, if it was a strong or AN/EN, or from the
2598 prev_for_neutral information recorded previously. */
2599 if (bidi_it->type == STRONG_L || bidi_it->type == STRONG_R
2600 || bidi_it->type == WEAK_AN || bidi_it->type == WEAK_EN)
2601 bidi_remember_char (&prev_for_neutral, bidi_it, 1);
2602 else
2603 prev_for_neutral = bidi_it->prev_for_neutral;
2604 /* Record the next_for_neutral type information. */
2605 if (bidi_it->next_for_neutral.charpos > bidi_it->charpos)
2606 next_for_neutral = bidi_it->next_for_neutral;
2607 else
2608 next_for_neutral.charpos = -1;
2609 if (!bidi_it->first_elt)
2610 {
2611 type = bidi_cache_find (bidi_it->charpos + bidi_it->nchars, 0, bidi_it);
2612 ch = bidi_it->ch;
2613 }
2614 if (type == UNKNOWN_BT)
2615 {
2616 type = bidi_resolve_weak (bidi_it);
2617 if (type == NEUTRAL_ON && bidi_find_bracket_pairs (bidi_it))
2618 resolve_bracket = true;
2619 }
2620 else
2621 {
2622 eassert (bidi_it->resolved_level == -1);
2623 /* If the cached state shows an increase of embedding level due
2624 to an isolate initiator, we need to update the 1st cached
2625 state of the next run of the current isolating sequence with
2626 the prev_for_neutral and next_for_neutral information, so
2627 that it will be picked up when we advance to that next run. */
2628 if (bidi_it->level_stack[bidi_it->stack_idx].level > prev_level
2629 && bidi_it->level_stack[bidi_it->stack_idx].isolate_status)
2630 {
2631 bidi_record_type_for_neutral (&prev_for_neutral, prev_level, 0);
2632 bidi_record_type_for_neutral (&next_for_neutral, prev_level, 1);
2633 }
2634 if (type == NEUTRAL_ON
2635 && bidi_paired_bracket_type (ch) == BIDI_BRACKET_OPEN)
2636 {
2637 if (bidi_it->bracket_pairing_pos > bidi_it->charpos)
2638 {
2639 /* A cached opening bracket that wasn't completely
2640 resolved yet. */
2641 resolve_bracket = true;
2642 }
2643 else if (bidi_it->bracket_pairing_pos == -1)
2644 {
2645 /* Higher levels were not BPA-resolved yet, even if
2646 cached by bidi_find_bracket_pairs. Force application
2647 of BPA to the new level now. */
2648 if (bidi_find_bracket_pairs (bidi_it))
2649 resolve_bracket = true;
2650 }
2651 }
2652 /* Keep track of the prev_for_neutral and next_for_neutral
2653 types, needed for resolving brackets below and for resolving
2654 neutrals in bidi_resolve_neutral. */
2655 if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level)
2656 {
2657 bidi_it->prev_for_neutral = prev_for_neutral;
2658 if (next_for_neutral.charpos > 0)
2659 bidi_it->next_for_neutral = next_for_neutral;
2660 }
2661 }
2662
2663 /* If needed, resolve the bracket type according to N0. */
2664 if (resolve_bracket)
2665 {
2666 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2667 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2668
2669 eassert (bidi_it->prev_for_neutral.type != UNKNOWN_BT);
2670 eassert (bidi_it->bracket_pairing_pos > bidi_it->charpos);
2671 if (bidi_it->bracket_enclosed_type == embedding_type) /* N0b */
2672 type = embedding_type;
2673 else
2674 {
2675 switch (bidi_it->prev_for_neutral.type)
2676 {
2677 case STRONG_R:
2678 case WEAK_EN:
2679 case WEAK_AN:
2680 type =
2681 (bidi_it->bracket_enclosed_type == STRONG_R) /* N0c */
2682 ? STRONG_R /* N0c1 */
2683 : embedding_type; /* N0c2 */
2684 break;
2685 case STRONG_L:
2686 type =
2687 (bidi_it->bracket_enclosed_type == STRONG_L) /* N0c */
2688 ? STRONG_L /* N0c1 */
2689 : embedding_type; /* N0c2 */
2690 break;
2691 default:
2692 /* N0d: Do not set the type for that bracket pair. */
2693 break;
2694 }
2695 }
2696 eassert (type == STRONG_L || type == STRONG_R || type == NEUTRAL_ON);
2697
2698 /* Update the type of the paired closing bracket to the same
2699 type as for the resolved opening bracket. */
2700 if (type != NEUTRAL_ON)
2701 {
2702 ptrdiff_t idx = bidi_cache_search (bidi_it->bracket_pairing_pos,
2703 -1, 1);
2704
2705 if (idx < bidi_cache_start)
2706 emacs_abort ();
2707 bidi_cache[idx].type = type;
2708 }
2709 }
2710
2711 return type;
2712 }
2713
2714 static bidi_type_t
2715 bidi_resolve_neutral (struct bidi_it *bidi_it)
2716 {
2717 bidi_type_t type = bidi_resolve_brackets (bidi_it);
2718 int current_level;
2719 bool is_neutral;
2720
2721 eassert (type == STRONG_R
2722 || type == STRONG_L
2723 || type == WEAK_BN
2724 || type == WEAK_EN
2725 || type == WEAK_AN
2726 || type == NEUTRAL_B
2727 || type == NEUTRAL_S
2728 || type == NEUTRAL_WS
2729 || type == NEUTRAL_ON
2730 || type == LRI
2731 || type == RLI
2732 || type == PDI);
2733
2734 current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2735 eassert (current_level >= 0);
2736 is_neutral = bidi_get_category (type) == NEUTRAL;
2737
2738 if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2739 we are already at paragraph end. */
2740 && (is_neutral || bidi_isolate_fmt_char (type)))
2741 /* N1-N2/Retaining */
2742 || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch)))
2743 {
2744 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
2745 {
2746 /* Make sure the data for resolving neutrals we are
2747 about to use is valid. */
2748 eassert (bidi_it->next_for_neutral.charpos > bidi_it->charpos
2749 /* PDI defines an eos, so it's OK for it to
2750 serve as its own next_for_neutral. */
2751 || (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2752 && bidi_it->type == PDI));
2753 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2754 bidi_it->next_for_neutral.type,
2755 current_level);
2756 }
2757 /* The next two "else if" clauses are shortcuts for the
2758 important special case when we have a long sequence of
2759 neutral or WEAK_BN characters, such as whitespace or nulls or
2760 other control characters, on the base embedding level of the
2761 paragraph, and that sequence goes all the way to the end of
2762 the paragraph and follows a character whose resolved
2763 directionality is identical to the base embedding level.
2764 (This is what happens in a buffer with plain L2R text that
2765 happens to include long sequences of control characters.) By
2766 virtue of N1, the result of examining this long sequence will
2767 always be either STRONG_L or STRONG_R, depending on the base
2768 embedding level. So we use this fact directly instead of
2769 entering the expensive loop in the "else" clause. */
2770 else if (current_level == 0
2771 && bidi_it->prev_for_neutral.type == STRONG_L
2772 && !bidi_explicit_dir_char (bidi_it->ch)
2773 && !bidi_isolate_fmt_char (type))
2774 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2775 STRONG_L, current_level);
2776 else if (/* current level is 1 */
2777 current_level == 1
2778 /* base embedding level is also 1 */
2779 && bidi_it->level_stack[0].level == 1
2780 /* previous character is one of those considered R for
2781 the purposes of W5 */
2782 && (bidi_it->prev_for_neutral.type == STRONG_R
2783 || bidi_it->prev_for_neutral.type == WEAK_EN
2784 || bidi_it->prev_for_neutral.type == WEAK_AN)
2785 && !bidi_explicit_dir_char (bidi_it->ch)
2786 && !bidi_isolate_fmt_char (type))
2787 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2788 STRONG_R, current_level);
2789 else
2790 {
2791 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
2792 the assumption of batch-style processing; see clauses W4,
2793 W5, and especially N1, which require to look far forward
2794 (as well as back) in the buffer/string. May the fleas of
2795 a thousand camels infest the armpits of those who design
2796 supposedly general-purpose algorithms by looking at their
2797 own implementations, and fail to consider other possible
2798 implementations! */
2799 struct bidi_it saved_it;
2800 bidi_type_t next_type;
2801 bool adjacent_to_neutrals = is_neutral;
2802
2803 bidi_copy_it (&saved_it, bidi_it);
2804 /* Scan the text forward until we find the first non-neutral
2805 character, and then use that to resolve the neutral we
2806 are dealing with now. We also cache the scanned iterator
2807 states, to salvage some of the effort later. */
2808 do {
2809 int old_sidx, new_sidx;
2810
2811 /* Paragraph separators have their levels fully resolved
2812 at this point, so cache them as resolved. */
2813 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2814 old_sidx = bidi_it->stack_idx;
2815 type = bidi_resolve_brackets (bidi_it);
2816 /* Skip level runs excluded from this isolating run sequence. */
2817 new_sidx = bidi_it->stack_idx;
2818 if (bidi_it->level_stack[new_sidx].level > current_level
2819 && (bidi_it->level_stack[new_sidx].isolate_status
2820 /* This is for when we have an isolate initiator
2821 immediately followed by an embedding or
2822 override initiator, in which case we get the
2823 level stack pushed twice by the single call to
2824 bidi_resolve_weak above. */
2825 || (new_sidx > old_sidx + 1
2826 && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2827 {
2828 while (bidi_it->level_stack[bidi_it->stack_idx].level
2829 > current_level)
2830 {
2831 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2832 type = bidi_resolve_brackets (bidi_it);
2833 }
2834 }
2835 if (!adjacent_to_neutrals
2836 && (bidi_get_category (type) == NEUTRAL
2837 || bidi_isolate_fmt_char (type)))
2838 adjacent_to_neutrals = true;
2839 } while (!(type == NEUTRAL_B
2840 || (type != WEAK_BN
2841 && bidi_get_category (type) != NEUTRAL
2842 && !bidi_isolate_fmt_char (type))
2843 /* This is all per level run, so stop when we
2844 reach the end of this level run. */
2845 || (bidi_it->level_stack[bidi_it->stack_idx].level
2846 != current_level)));
2847
2848 /* Record the character we stopped at. */
2849 bidi_remember_char (&saved_it.next_for_neutral, bidi_it, 1);
2850
2851 if ((bidi_it->level_stack[bidi_it->stack_idx].level != current_level)
2852 || type == NEUTRAL_B)
2853 {
2854 /* Marched all the way to the end of this level run. We
2855 need to use the eos type, whose information is stored
2856 by bidi_set_sos_type in the prev_for_neutral
2857 member. */
2858 if (adjacent_to_neutrals)
2859 next_type = bidi_it->prev_for_neutral.type;
2860 else
2861 {
2862 /* This is a BN which does not adjoin neutrals.
2863 Leave its type alone. */
2864 bidi_copy_it (bidi_it, &saved_it);
2865 return bidi_it->type;
2866 }
2867 }
2868 else
2869 {
2870 switch (type)
2871 {
2872 case STRONG_L:
2873 case STRONG_R:
2874 case STRONG_AL:
2875 /* Actually, STRONG_AL cannot happen here, because
2876 bidi_resolve_weak converts it to STRONG_R, per W3. */
2877 eassert (type != STRONG_AL);
2878 next_type = type;
2879 break;
2880 case WEAK_EN:
2881 case WEAK_AN:
2882 /* N1: "European and Arabic numbers act as if they
2883 were R in terms of their influence on NIs." */
2884 next_type = STRONG_R;
2885 break;
2886 default:
2887 emacs_abort ();
2888 break;
2889 }
2890 }
2891 /* Resolve the type of all the NIs found during the above loop. */
2892 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
2893 next_type, current_level);
2894 /* Update next_for_neutral with the resolved type, so we
2895 could use it for all the other NIs up to the place where
2896 we exited the loop. */
2897 saved_it.next_for_neutral.type = next_type;
2898 bidi_check_type (type);
2899 /* Update the character which caused us to enter the above loop. */
2900 saved_it.type = type;
2901 bidi_check_type (next_type);
2902 bidi_copy_it (bidi_it, &saved_it);
2903 }
2904 }
2905 return type;
2906 }
2907
2908 /* Given an iterator state in BIDI_IT, advance one character position
2909 in the buffer/string to the next character (in the logical order),
2910 resolve the bidi type of that next character, and return that
2911 type. */
2912 static bidi_type_t
2913 bidi_type_of_next_char (struct bidi_it *bidi_it)
2914 {
2915 bidi_type_t type;
2916
2917 /* This should always be called during a forward scan. */
2918 if (bidi_it->scan_dir != 1)
2919 emacs_abort ();
2920
2921 type = bidi_resolve_neutral (bidi_it);
2922
2923 return type;
2924 }
2925
2926 /* Given an iterator state BIDI_IT, advance one character position in
2927 the buffer/string to the next character (in the current scan
2928 direction), resolve the embedding and implicit levels of that next
2929 character, and return the resulting level. */
2930 static int
2931 bidi_level_of_next_char (struct bidi_it *bidi_it)
2932 {
2933 bidi_type_t type = UNKNOWN_BT;
2934 int level;
2935 ptrdiff_t next_char_pos = -2;
2936
2937 if (bidi_it->scan_dir == 1)
2938 {
2939 ptrdiff_t eob
2940 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2941 ? bidi_it->string.schars : ZV);
2942
2943 /* There's no sense in trying to advance if we've already hit
2944 the end of text. */
2945 if (bidi_it->charpos >= eob)
2946 {
2947 eassert (bidi_it->resolved_level >= 0);
2948 return bidi_it->resolved_level;
2949 }
2950 }
2951
2952 /* Perhaps the character we want is already cached s fully resolved.
2953 If it is, the call to bidi_cache_find below will return a type
2954 other than UNKNOWN_BT. */
2955 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
2956 {
2957 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2958 ? 0 : 1);
2959
2960 if (bidi_it->scan_dir > 0)
2961 {
2962 if (bidi_it->nchars <= 0)
2963 emacs_abort ();
2964 next_char_pos = bidi_it->charpos + bidi_it->nchars;
2965 }
2966 else if (bidi_it->charpos >= bob)
2967 /* Implementation note: we allow next_char_pos to be as low as
2968 0 for buffers or -1 for strings, and that is okay because
2969 that's the "position" of the sentinel iterator state we
2970 cached at the beginning of the iteration. */
2971 next_char_pos = bidi_it->charpos - 1;
2972 if (next_char_pos >= bob - 1)
2973 type = bidi_cache_find (next_char_pos, 1, bidi_it);
2974 if (type != UNKNOWN_BT)
2975 {
2976 /* We asked the cache for fully resolved states. */
2977 eassert (bidi_it->resolved_level >= 0);
2978 return bidi_it->resolved_level;
2979 }
2980 }
2981
2982 if (bidi_it->scan_dir == -1)
2983 /* If we are going backwards, the iterator state is already cached
2984 from previous scans, and should be fully resolved. */
2985 emacs_abort ();
2986
2987 if (type == UNKNOWN_BT)
2988 type = bidi_type_of_next_char (bidi_it);
2989
2990 if (type == NEUTRAL_B)
2991 {
2992 eassert (bidi_it->resolved_level >= 0);
2993 return bidi_it->resolved_level;
2994 }
2995
2996 level = bidi_it->level_stack[bidi_it->stack_idx].level;
2997
2998 eassert ((type == STRONG_R
2999 || type == STRONG_L
3000 || type == WEAK_BN
3001 || type == WEAK_EN
3002 || type == WEAK_AN));
3003 bidi_it->type = type;
3004 bidi_check_type (bidi_it->type);
3005
3006 /* For L1 below, we need to know, for each WS character, whether
3007 it belongs to a sequence of WS characters preceding a newline
3008 or a TAB or a paragraph separator. */
3009 if ((bidi_it->orig_type == NEUTRAL_WS
3010 || bidi_isolate_fmt_char (bidi_it->orig_type))
3011 && bidi_it->next_for_ws.charpos < bidi_it->charpos)
3012 {
3013 int ch;
3014 ptrdiff_t clen = bidi_it->ch_len;
3015 ptrdiff_t bpos = bidi_it->bytepos;
3016 ptrdiff_t cpos = bidi_it->charpos;
3017 ptrdiff_t disp_pos = bidi_it->disp_pos;
3018 ptrdiff_t nc = bidi_it->nchars;
3019 struct bidi_string_data bs = bidi_it->string;
3020 bidi_type_t chtype;
3021 bool fwp = bidi_it->frame_window_p;
3022 int dpp = bidi_it->disp_prop;
3023
3024 if (bidi_it->nchars <= 0)
3025 emacs_abort ();
3026 do {
3027 ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
3028 bidi_it->w, fwp, &clen, &nc);
3029 chtype = bidi_get_type (ch, NEUTRAL_DIR);
3030 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
3031 || bidi_isolate_fmt_char (chtype)
3032 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
3033 bidi_it->next_for_ws.type = chtype;
3034 bidi_check_type (bidi_it->next_for_ws.type);
3035 bidi_it->next_for_ws.charpos = cpos;
3036 }
3037
3038 /* Update the cache, but only if this state was already cached. */
3039 bidi_cache_iterator_state (bidi_it, 1, 1);
3040
3041 /* Resolve implicit levels. */
3042 if (bidi_it->orig_type == NEUTRAL_B /* L1 */
3043 || bidi_it->orig_type == NEUTRAL_S
3044 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
3045 || (bidi_it->orig_type == NEUTRAL_WS
3046 && (bidi_it->next_for_ws.type == NEUTRAL_B
3047 || bidi_it->next_for_ws.type == NEUTRAL_S)))
3048 level = bidi_it->level_stack[0].level;
3049 else if ((level & 1) == 0) /* I1 */
3050 {
3051 if (type == STRONG_R)
3052 level++;
3053 else if (type == WEAK_EN || type == WEAK_AN)
3054 level += 2;
3055 }
3056 else /* I2 */
3057 {
3058 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
3059 level++;
3060 }
3061
3062 bidi_it->resolved_level = level;
3063 return level;
3064 }
3065
3066 /* Move to the other edge of a level given by LEVEL. If END_FLAG,
3067 we are at the end of a level, and we need to prepare to
3068 resume the scan of the lower level.
3069
3070 If this level's other edge is cached, we simply jump to it, filling
3071 the iterator structure with the iterator state on the other edge.
3072 Otherwise, we walk the buffer or string until we come back to the
3073 same level as LEVEL.
3074
3075 Note: we are not talking here about a ``level run'' in the UAX#9
3076 sense of the term, but rather about a ``level'' which includes
3077 all the levels higher than it. In other words, given the levels
3078 like this:
3079
3080 11111112222222333333334443343222222111111112223322111
3081 A B C
3082
3083 and assuming we are at point A scanning left to right, this
3084 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
3085 at point B. */
3086 static void
3087 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
3088 {
3089 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
3090 ptrdiff_t idx;
3091
3092 /* Try the cache first. */
3093 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
3094 >= bidi_cache_start)
3095 bidi_cache_fetch_state (idx, bidi_it);
3096 else
3097 {
3098 int new_level;
3099
3100 /* If we are at end of level, its edges must be cached. */
3101 if (end_flag)
3102 emacs_abort ();
3103
3104 bidi_cache_iterator_state (bidi_it, 1, 0);
3105 do {
3106 new_level = bidi_level_of_next_char (bidi_it);
3107 bidi_cache_iterator_state (bidi_it, 1, 0);
3108 } while (new_level >= level);
3109 }
3110 }
3111
3112 void
3113 bidi_move_to_visually_next (struct bidi_it *bidi_it)
3114 {
3115 int old_level, new_level, next_level;
3116 struct bidi_it sentinel;
3117 struct gcpro gcpro1;
3118
3119 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
3120 emacs_abort ();
3121
3122 if (bidi_it->scan_dir == 0)
3123 {
3124 bidi_it->scan_dir = 1; /* default to logical order */
3125 }
3126
3127 /* The code below can call eval, and thus cause GC. If we are
3128 iterating a Lisp string, make sure it won't be GCed. */
3129 if (STRINGP (bidi_it->string.lstring))
3130 GCPRO1 (bidi_it->string.lstring);
3131
3132 /* If we just passed a newline, initialize for the next line. */
3133 if (!bidi_it->first_elt
3134 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3135 bidi_line_init (bidi_it);
3136
3137 /* Prepare the sentinel iterator state, and cache it. When we bump
3138 into it, scanning backwards, we'll know that the last non-base
3139 level is exhausted. */
3140 if (bidi_cache_idx == bidi_cache_start)
3141 {
3142 bidi_copy_it (&sentinel, bidi_it);
3143 if (bidi_it->first_elt)
3144 {
3145 sentinel.charpos--; /* cached charpos needs to be monotonic */
3146 sentinel.bytepos--;
3147 sentinel.ch = '\n'; /* doesn't matter, but why not? */
3148 sentinel.ch_len = 1;
3149 sentinel.nchars = 1;
3150 }
3151 bidi_cache_iterator_state (&sentinel, 1, 0);
3152 }
3153
3154 old_level = bidi_it->resolved_level;
3155 new_level = bidi_level_of_next_char (bidi_it);
3156
3157 /* Reordering of resolved levels (clause L2) is implemented by
3158 jumping to the other edge of the level and flipping direction of
3159 scanning the text whenever we find a level change. */
3160 if (new_level != old_level)
3161 {
3162 bool ascending = new_level > old_level;
3163 int level_to_search = ascending ? old_level + 1 : old_level;
3164 int incr = ascending ? 1 : -1;
3165 int expected_next_level = old_level + incr;
3166
3167 /* Jump (or walk) to the other edge of this level. */
3168 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3169 /* Switch scan direction and peek at the next character in the
3170 new direction. */
3171 bidi_it->scan_dir = -bidi_it->scan_dir;
3172
3173 /* The following loop handles the case where the resolved level
3174 jumps by more than one. This is typical for numbers inside a
3175 run of text with left-to-right embedding direction, but can
3176 also happen in other situations. In those cases the decision
3177 where to continue after a level change, and in what direction,
3178 is tricky. For example, given a text like below:
3179
3180 abcdefgh
3181 11336622
3182
3183 (where the numbers below the text show the resolved levels),
3184 the result of reordering according to UAX#9 should be this:
3185
3186 efdcghba
3187
3188 This is implemented by the loop below which flips direction
3189 and jumps to the other edge of the level each time it finds
3190 the new level not to be the expected one. The expected level
3191 is always one more or one less than the previous one. */
3192 next_level = bidi_peek_at_next_level (bidi_it);
3193 while (next_level != expected_next_level)
3194 {
3195 /* If next_level is -1, it means we have an unresolved level
3196 in the cache, which at this point should not happen. If
3197 it does, we will infloop. */
3198 eassert (next_level >= 0);
3199 /* If next_level is not consistent with incr, we might
3200 infloop. */
3201 eassert (incr > 0
3202 ? next_level > expected_next_level
3203 : next_level < expected_next_level);
3204 expected_next_level += incr;
3205 level_to_search += incr;
3206 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3207 bidi_it->scan_dir = -bidi_it->scan_dir;
3208 next_level = bidi_peek_at_next_level (bidi_it);
3209 }
3210
3211 /* Finally, deliver the next character in the new direction. */
3212 next_level = bidi_level_of_next_char (bidi_it);
3213 }
3214
3215 /* Take note when we have just processed the newline that precedes
3216 the end of the paragraph. The next time we are about to be
3217 called, set_iterator_to_next will automatically reinit the
3218 paragraph direction, if needed. We do this at the newline before
3219 the paragraph separator, because the next character might not be
3220 the first character of the next paragraph, due to the bidi
3221 reordering, whereas we _must_ know the paragraph base direction
3222 _before_ we process the paragraph's text, since the base
3223 direction affects the reordering. */
3224 if (bidi_it->scan_dir == 1
3225 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3226 {
3227 /* The paragraph direction of the entire string, once
3228 determined, is in effect for the entire string. Setting the
3229 separator limit to the end of the string prevents
3230 bidi_paragraph_init from being called automatically on this
3231 string. */
3232 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3233 bidi_it->separator_limit = bidi_it->string.schars;
3234 else if (bidi_it->bytepos < ZV_BYTE)
3235 {
3236 ptrdiff_t sep_len
3237 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
3238 bidi_it->bytepos + bidi_it->ch_len);
3239 if (bidi_it->nchars <= 0)
3240 emacs_abort ();
3241 if (sep_len >= 0)
3242 {
3243 bidi_it->new_paragraph = 1;
3244 /* Record the buffer position of the last character of the
3245 paragraph separator. */
3246 bidi_it->separator_limit
3247 = bidi_it->charpos + bidi_it->nchars + sep_len;
3248 }
3249 }
3250 }
3251
3252 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
3253 {
3254 /* If we are at paragraph's base embedding level and beyond the
3255 last cached position, the cache's job is done and we can
3256 discard it. */
3257 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3258 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
3259 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
3260 bidi_cache_reset ();
3261 /* But as long as we are caching during forward scan, we must
3262 cache each state, or else the cache integrity will be
3263 compromised: it assumes cached states correspond to buffer
3264 positions 1:1. */
3265 else
3266 bidi_cache_iterator_state (bidi_it, 1, 0);
3267 }
3268
3269 eassert (bidi_it->resolved_level >= 0
3270 && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
3271
3272 if (STRINGP (bidi_it->string.lstring))
3273 UNGCPRO;
3274 }
3275
3276 /* This is meant to be called from within the debugger, whenever you
3277 wish to examine the cache contents. */
3278 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
3279 void
3280 bidi_dump_cached_states (void)
3281 {
3282 ptrdiff_t i;
3283 int ndigits = 1;
3284
3285 if (bidi_cache_idx == 0)
3286 {
3287 fprintf (stderr, "The cache is empty.\n");
3288 return;
3289 }
3290 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
3291 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
3292
3293 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
3294 ndigits++;
3295 fputs ("ch ", stderr);
3296 for (i = 0; i < bidi_cache_idx; i++)
3297 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
3298 fputs ("\n", stderr);
3299 fputs ("lvl ", stderr);
3300 for (i = 0; i < bidi_cache_idx; i++)
3301 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
3302 fputs ("\n", stderr);
3303 fputs ("pos ", stderr);
3304 for (i = 0; i < bidi_cache_idx; i++)
3305 fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
3306 fputs ("\n", stderr);
3307 }