]> code.delx.au - gnu-emacs/blob - src/bidi.c
* bidi.c: Integer overflow fix.
[gnu-emacs] / src / bidi.c
1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
3 Free Software Foundation, Inc.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
19
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 (UBA) as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
26 designed to be called once for every character in the buffer or
27 string.
28
29 The main entry point is bidi_move_to_visually_next. Each time it
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
36 character by resolving their levels on the fly.
37
38 Two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
43 A few auxiliary entry points are used to initialize the bidi
44 iterator for iterating an object (buffer or string), push and pop
45 the bidi iterator state, and save and restore the state of the bidi
46 cache.
47
48 If you want to understand the code, you will have to read it
49 together with the relevant portions of UAX#9. The comments include
50 references to UAX#9 rules, for that very reason.
51
52 A note about references to UAX#9 rules: if the reference says
53 something like "X9/Retaining", it means that you need to refer to
54 rule X9 and to its modifications decribed in the "Implementation
55 Notes" section of UAX#9, under "Retaining Format Codes". */
56
57 #include <config.h>
58 #include <stdio.h>
59 #include <setjmp.h>
60
61 #include "lisp.h"
62 #include "buffer.h"
63 #include "character.h"
64 #include "dispextern.h"
65
66 static int bidi_initialized = 0;
67
68 static Lisp_Object bidi_type_table, bidi_mirror_table;
69
70 #define LRM_CHAR 0x200E
71 #define RLM_CHAR 0x200F
72 #define BIDI_EOB -1
73
74 /* Data type for describing the bidirectional character categories. */
75 typedef enum {
76 UNKNOWN_BC,
77 NEUTRAL,
78 WEAK,
79 STRONG
80 } bidi_category_t;
81
82 extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
83 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
84
85 static Lisp_Object paragraph_start_re, paragraph_separate_re;
86 static Lisp_Object Qparagraph_start, Qparagraph_separate;
87
88 \f
89 /***********************************************************************
90 Utilities
91 ***********************************************************************/
92
93 /* Return the bidi type of a character CH, subject to the current
94 directional OVERRIDE. */
95 static inline bidi_type_t
96 bidi_get_type (int ch, bidi_dir_t override)
97 {
98 bidi_type_t default_type;
99
100 if (ch == BIDI_EOB)
101 return NEUTRAL_B;
102 if (ch < 0 || ch > MAX_CHAR)
103 abort ();
104
105 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
106
107 if (override == NEUTRAL_DIR)
108 return default_type;
109
110 switch (default_type)
111 {
112 /* Although UAX#9 does not tell, it doesn't make sense to
113 override NEUTRAL_B and LRM/RLM characters. */
114 case NEUTRAL_B:
115 case LRE:
116 case LRO:
117 case RLE:
118 case RLO:
119 case PDF:
120 return default_type;
121 default:
122 switch (ch)
123 {
124 case LRM_CHAR:
125 case RLM_CHAR:
126 return default_type;
127 default:
128 if (override == L2R) /* X6 */
129 return STRONG_L;
130 else if (override == R2L)
131 return STRONG_R;
132 else
133 abort (); /* can't happen: handled above */
134 }
135 }
136 }
137
138 static void
139 bidi_check_type (bidi_type_t type)
140 {
141 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
142 abort ();
143 }
144
145 /* Given a bidi TYPE of a character, return its category. */
146 static inline bidi_category_t
147 bidi_get_category (bidi_type_t type)
148 {
149 switch (type)
150 {
151 case UNKNOWN_BT:
152 return UNKNOWN_BC;
153 case STRONG_L:
154 case STRONG_R:
155 case STRONG_AL:
156 case LRE:
157 case LRO:
158 case RLE:
159 case RLO:
160 return STRONG;
161 case PDF: /* ??? really?? */
162 case WEAK_EN:
163 case WEAK_ES:
164 case WEAK_ET:
165 case WEAK_AN:
166 case WEAK_CS:
167 case WEAK_NSM:
168 case WEAK_BN:
169 return WEAK;
170 case NEUTRAL_B:
171 case NEUTRAL_S:
172 case NEUTRAL_WS:
173 case NEUTRAL_ON:
174 return NEUTRAL;
175 default:
176 abort ();
177 }
178 }
179
180 /* Return the mirrored character of C, if it has one. If C has no
181 mirrored counterpart, return C.
182 Note: The conditions in UAX#9 clause L4 regarding the surrounding
183 context must be tested by the caller. */
184 int
185 bidi_mirror_char (int c)
186 {
187 Lisp_Object val;
188
189 if (c == BIDI_EOB)
190 return c;
191 if (c < 0 || c > MAX_CHAR)
192 abort ();
193
194 val = CHAR_TABLE_REF (bidi_mirror_table, c);
195 if (INTEGERP (val))
196 {
197 int v = XINT (val);
198
199 if (v < 0 || v > MAX_CHAR)
200 abort ();
201
202 return v;
203 }
204
205 return c;
206 }
207
208 /* Determine the start-of-run (sor) directional type given the two
209 embedding levels on either side of the run boundary. Also, update
210 the saved info about previously seen characters, since that info is
211 generally valid for a single level run. */
212 static inline void
213 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
214 {
215 int higher_level = level_before > level_after ? level_before : level_after;
216
217 /* The prev_was_pdf gork is required for when we have several PDFs
218 in a row. In that case, we want to compute the sor type for the
219 next level run only once: when we see the first PDF. That's
220 because the sor type depends only on the higher of the two levels
221 that we find on the two sides of the level boundary (see UAX#9,
222 clause X10), and so we don't need to know the final embedding
223 level to which we descend after processing all the PDFs. */
224 if (!bidi_it->prev_was_pdf || level_before < level_after)
225 /* FIXME: should the default sor direction be user selectable? */
226 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
227 if (level_before > level_after)
228 bidi_it->prev_was_pdf = 1;
229
230 bidi_it->prev.type = UNKNOWN_BT;
231 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
232 bidi_it->last_strong.orig_type = UNKNOWN_BT;
233 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
234 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
235 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
236 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
237 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
238 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
239 }
240
241 /* Push the current embedding level and override status; reset the
242 current level to LEVEL and the current override status to OVERRIDE. */
243 static inline void
244 bidi_push_embedding_level (struct bidi_it *bidi_it,
245 int level, bidi_dir_t override)
246 {
247 bidi_it->stack_idx++;
248 xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
249 bidi_it->level_stack[bidi_it->stack_idx].level = level;
250 bidi_it->level_stack[bidi_it->stack_idx].override = override;
251 }
252
253 /* Pop the embedding level and directional override status from the
254 stack, and return the new level. */
255 static inline int
256 bidi_pop_embedding_level (struct bidi_it *bidi_it)
257 {
258 /* UAX#9 says to ignore invalid PDFs. */
259 if (bidi_it->stack_idx > 0)
260 bidi_it->stack_idx--;
261 return bidi_it->level_stack[bidi_it->stack_idx].level;
262 }
263
264 /* Record in SAVED_INFO the information about the current character. */
265 static inline void
266 bidi_remember_char (struct bidi_saved_info *saved_info,
267 struct bidi_it *bidi_it)
268 {
269 saved_info->charpos = bidi_it->charpos;
270 saved_info->bytepos = bidi_it->bytepos;
271 saved_info->type = bidi_it->type;
272 bidi_check_type (bidi_it->type);
273 saved_info->type_after_w1 = bidi_it->type_after_w1;
274 bidi_check_type (bidi_it->type_after_w1);
275 saved_info->orig_type = bidi_it->orig_type;
276 bidi_check_type (bidi_it->orig_type);
277 }
278
279 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
280 copies the part of the level stack that is actually in use. */
281 static inline void
282 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
283 {
284 int i;
285
286 /* Copy everything except the level stack and beyond. */
287 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
288
289 /* Copy the active part of the level stack. */
290 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
291 for (i = 1; i <= from->stack_idx; i++)
292 to->level_stack[i] = from->level_stack[i];
293 }
294
295 \f
296 /***********************************************************************
297 Caching the bidi iterator states
298 ***********************************************************************/
299
300 #define BIDI_CACHE_CHUNK 200
301 static struct bidi_it *bidi_cache;
302 static ptrdiff_t bidi_cache_size = 0;
303 enum { elsz = sizeof (struct bidi_it) };
304 static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
305 static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
306 static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
307 "stack" level */
308
309 /* 5-slot stack for saving the start of the previous level of the
310 cache. xdisp.c maintains a 5-slot stack for its iterator state,
311 and we need the same size of our stack. */
312 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
313 static int bidi_cache_sp;
314
315 /* Size of header used by bidi_shelve_cache. */
316 enum
317 {
318 bidi_shelve_header_size =
319 (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
320 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
321 + sizeof (bidi_cache_last_idx))
322 };
323
324 /* Reset the cache state to the empty state. We only reset the part
325 of the cache relevant to iteration of the current object. Previous
326 objects, which are pushed on the display iterator's stack, are left
327 intact. This is called when the cached information is no more
328 useful for the current iteration, e.g. when we were reseated to a
329 new position on the same object. */
330 static inline void
331 bidi_cache_reset (void)
332 {
333 bidi_cache_idx = bidi_cache_start;
334 bidi_cache_last_idx = -1;
335 }
336
337 /* Shrink the cache to its minimal size. Called when we init the bidi
338 iterator for reordering a buffer or a string that does not come
339 from display properties, because that means all the previously
340 cached info is of no further use. */
341 static inline void
342 bidi_cache_shrink (void)
343 {
344 if (bidi_cache_size > BIDI_CACHE_CHUNK)
345 {
346 bidi_cache_size = BIDI_CACHE_CHUNK;
347 bidi_cache =
348 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
349 }
350 bidi_cache_reset ();
351 }
352
353 static inline void
354 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
355 {
356 int current_scan_dir = bidi_it->scan_dir;
357
358 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
359 abort ();
360
361 bidi_copy_it (bidi_it, &bidi_cache[idx]);
362 bidi_it->scan_dir = current_scan_dir;
363 bidi_cache_last_idx = idx;
364 }
365
366 /* Find a cached state with a given CHARPOS and resolved embedding
367 level less or equal to LEVEL. if LEVEL is -1, disregard the
368 resolved levels in cached states. DIR, if non-zero, means search
369 in that direction from the last cache hit. */
370 static inline ptrdiff_t
371 bidi_cache_search (EMACS_INT charpos, int level, int dir)
372 {
373 ptrdiff_t i, i_start;
374
375 if (bidi_cache_idx > bidi_cache_start)
376 {
377 if (bidi_cache_last_idx == -1)
378 bidi_cache_last_idx = bidi_cache_idx - 1;
379 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
380 {
381 dir = -1;
382 i_start = bidi_cache_last_idx - 1;
383 }
384 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
385 + bidi_cache[bidi_cache_last_idx].nchars - 1))
386 {
387 dir = 1;
388 i_start = bidi_cache_last_idx + 1;
389 }
390 else if (dir)
391 i_start = bidi_cache_last_idx;
392 else
393 {
394 dir = -1;
395 i_start = bidi_cache_idx - 1;
396 }
397
398 if (dir < 0)
399 {
400 /* Linear search for now; FIXME! */
401 for (i = i_start; i >= bidi_cache_start; i--)
402 if (bidi_cache[i].charpos <= charpos
403 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
404 && (level == -1 || bidi_cache[i].resolved_level <= level))
405 return i;
406 }
407 else
408 {
409 for (i = i_start; i < bidi_cache_idx; i++)
410 if (bidi_cache[i].charpos <= charpos
411 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
412 && (level == -1 || bidi_cache[i].resolved_level <= level))
413 return i;
414 }
415 }
416
417 return -1;
418 }
419
420 /* Find a cached state where the resolved level changes to a value
421 that is lower than LEVEL, and return its cache slot index. DIR is
422 the direction to search, starting with the last used cache slot.
423 If DIR is zero, we search backwards from the last occupied cache
424 slot. BEFORE, if non-zero, means return the index of the slot that
425 is ``before'' the level change in the search direction. That is,
426 given the cached levels like this:
427
428 1122333442211
429 AB C
430
431 and assuming we are at the position cached at the slot marked with
432 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
433 index of slot B or A, depending whether BEFORE is, respectively,
434 non-zero or zero. */
435 static ptrdiff_t
436 bidi_cache_find_level_change (int level, int dir, int before)
437 {
438 if (bidi_cache_idx)
439 {
440 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
441 int incr = before ? 1 : 0;
442
443 xassert (!dir || bidi_cache_last_idx >= 0);
444
445 if (!dir)
446 dir = -1;
447 else if (!incr)
448 i += dir;
449
450 if (dir < 0)
451 {
452 while (i >= bidi_cache_start + incr)
453 {
454 if (bidi_cache[i - incr].resolved_level >= 0
455 && bidi_cache[i - incr].resolved_level < level)
456 return i;
457 i--;
458 }
459 }
460 else
461 {
462 while (i < bidi_cache_idx - incr)
463 {
464 if (bidi_cache[i + incr].resolved_level >= 0
465 && bidi_cache[i + incr].resolved_level < level)
466 return i;
467 i++;
468 }
469 }
470 }
471
472 return -1;
473 }
474
475 static inline void
476 bidi_cache_ensure_space (ptrdiff_t idx)
477 {
478 /* Enlarge the cache as needed. */
479 if (idx >= bidi_cache_size)
480 {
481 ptrdiff_t new_size;
482
483 /* The bidi cache cannot be larger than the largest Lisp string
484 or buffer. */
485 ptrdiff_t string_or_buffer_bound =
486 max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
487
488 /* Also, it cannot be larger than what C can represent. */
489 ptrdiff_t c_bound =
490 (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
491
492 if (min (string_or_buffer_bound, c_bound) <= idx)
493 memory_full (SIZE_MAX);
494 new_size = idx - idx % BIDI_CACHE_CHUNK + BIDI_CACHE_CHUNK;
495 bidi_cache = (struct bidi_it *) xrealloc (bidi_cache, new_size * elsz);
496 bidi_cache_size = new_size;
497 }
498 }
499
500 static inline void
501 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
502 {
503 ptrdiff_t idx;
504
505 /* We should never cache on backward scans. */
506 if (bidi_it->scan_dir == -1)
507 abort ();
508 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
509
510 if (idx < 0)
511 {
512 idx = bidi_cache_idx;
513 bidi_cache_ensure_space (idx);
514 /* Character positions should correspond to cache positions 1:1.
515 If we are outside the range of cached positions, the cache is
516 useless and must be reset. */
517 if (idx > bidi_cache_start &&
518 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
519 + bidi_cache[idx - 1].nchars)
520 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
521 {
522 bidi_cache_reset ();
523 idx = bidi_cache_start;
524 }
525 if (bidi_it->nchars <= 0)
526 abort ();
527 bidi_copy_it (&bidi_cache[idx], bidi_it);
528 if (!resolved)
529 bidi_cache[idx].resolved_level = -1;
530 }
531 else
532 {
533 /* Copy only the members which could have changed, to avoid
534 costly copying of the entire struct. */
535 bidi_cache[idx].type = bidi_it->type;
536 bidi_check_type (bidi_it->type);
537 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
538 bidi_check_type (bidi_it->type_after_w1);
539 if (resolved)
540 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
541 else
542 bidi_cache[idx].resolved_level = -1;
543 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
544 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
545 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
546 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
547 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
548 }
549
550 bidi_cache_last_idx = idx;
551 if (idx >= bidi_cache_idx)
552 bidi_cache_idx = idx + 1;
553 }
554
555 static inline bidi_type_t
556 bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
557 {
558 ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
559
560 if (i >= bidi_cache_start)
561 {
562 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
563
564 bidi_copy_it (bidi_it, &bidi_cache[i]);
565 bidi_cache_last_idx = i;
566 /* Don't let scan direction from from the cached state override
567 the current scan direction. */
568 bidi_it->scan_dir = current_scan_dir;
569 return bidi_it->type;
570 }
571
572 return UNKNOWN_BT;
573 }
574
575 static inline int
576 bidi_peek_at_next_level (struct bidi_it *bidi_it)
577 {
578 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
579 abort ();
580 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
581 }
582
583 \f
584 /***********************************************************************
585 Pushing and popping the bidi iterator state
586 ***********************************************************************/
587
588 /* Push the bidi iterator state in preparation for reordering a
589 different object, e.g. display string found at certain buffer
590 position. Pushing the bidi iterator boils down to saving its
591 entire state on the cache and starting a new cache "stacked" on top
592 of the current cache. */
593 void
594 bidi_push_it (struct bidi_it *bidi_it)
595 {
596 /* Save the current iterator state in its entirety after the last
597 used cache slot. */
598 bidi_cache_ensure_space (bidi_cache_idx);
599 memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
600
601 /* Push the current cache start onto the stack. */
602 xassert (bidi_cache_sp < IT_STACK_SIZE);
603 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
604
605 /* Start a new level of cache, and make it empty. */
606 bidi_cache_start = bidi_cache_idx;
607 bidi_cache_last_idx = -1;
608 }
609
610 /* Restore the iterator state saved by bidi_push_it and return the
611 cache to the corresponding state. */
612 void
613 bidi_pop_it (struct bidi_it *bidi_it)
614 {
615 if (bidi_cache_start <= 0)
616 abort ();
617
618 /* Reset the next free cache slot index to what it was before the
619 call to bidi_push_it. */
620 bidi_cache_idx = bidi_cache_start - 1;
621
622 /* Restore the bidi iterator state saved in the cache. */
623 memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it));
624
625 /* Pop the previous cache start from the stack. */
626 if (bidi_cache_sp <= 0)
627 abort ();
628 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
629
630 /* Invalidate the last-used cache slot data. */
631 bidi_cache_last_idx = -1;
632 }
633
634 /* Stash away a copy of the cache and its control variables. */
635 void *
636 bidi_shelve_cache (void)
637 {
638 unsigned char *databuf;
639
640 if (bidi_cache_idx == 0)
641 return NULL;
642
643 databuf = xmalloc (bidi_shelve_header_size
644 + bidi_cache_idx * sizeof (struct bidi_it));
645 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
646 memcpy (databuf + sizeof (bidi_cache_idx),
647 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
648 memcpy (databuf + sizeof (bidi_cache_idx)
649 + bidi_cache_idx * sizeof (struct bidi_it),
650 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
651 memcpy (databuf + sizeof (bidi_cache_idx)
652 + bidi_cache_idx * sizeof (struct bidi_it)
653 + sizeof (bidi_cache_start_stack),
654 &bidi_cache_sp, sizeof (bidi_cache_sp));
655 memcpy (databuf + sizeof (bidi_cache_idx)
656 + bidi_cache_idx * sizeof (struct bidi_it)
657 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
658 &bidi_cache_start, sizeof (bidi_cache_start));
659 memcpy (databuf + sizeof (bidi_cache_idx)
660 + bidi_cache_idx * sizeof (struct bidi_it)
661 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
662 + sizeof (bidi_cache_start),
663 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
664
665 return databuf;
666 }
667
668 /* Restore the cache state from a copy stashed away by bidi_shelve_cache. */
669 void
670 bidi_unshelve_cache (void *databuf)
671 {
672 unsigned char *p = databuf;
673
674 if (!p)
675 {
676 /* A NULL pointer means an empty cache. */
677 bidi_cache_start = 0;
678 bidi_cache_sp = 0;
679 bidi_cache_reset ();
680 }
681 else
682 {
683 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
684 bidi_cache_ensure_space (bidi_cache_idx);
685 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
686 bidi_cache_idx * sizeof (struct bidi_it));
687 memcpy (bidi_cache_start_stack,
688 p + sizeof (bidi_cache_idx)
689 + bidi_cache_idx * sizeof (struct bidi_it),
690 sizeof (bidi_cache_start_stack));
691 memcpy (&bidi_cache_sp,
692 p + sizeof (bidi_cache_idx)
693 + bidi_cache_idx * sizeof (struct bidi_it)
694 + sizeof (bidi_cache_start_stack),
695 sizeof (bidi_cache_sp));
696 memcpy (&bidi_cache_start,
697 p + sizeof (bidi_cache_idx)
698 + bidi_cache_idx * sizeof (struct bidi_it)
699 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
700 sizeof (bidi_cache_start));
701 memcpy (&bidi_cache_last_idx,
702 p + sizeof (bidi_cache_idx)
703 + bidi_cache_idx * sizeof (struct bidi_it)
704 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
705 + sizeof (bidi_cache_start),
706 sizeof (bidi_cache_last_idx));
707
708 xfree (p);
709 }
710 }
711
712 \f
713 /***********************************************************************
714 Initialization
715 ***********************************************************************/
716 static void
717 bidi_initialize (void)
718 {
719
720 #include "biditype.h"
721 #include "bidimirror.h"
722
723 int i;
724
725 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
726 staticpro (&bidi_type_table);
727
728 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
729 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
730 make_number (bidi_type[i].type));
731
732 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
733 staticpro (&bidi_mirror_table);
734
735 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
736 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
737 make_number (bidi_mirror[i].to));
738
739 Qparagraph_start = intern ("paragraph-start");
740 staticpro (&Qparagraph_start);
741 paragraph_start_re = Fsymbol_value (Qparagraph_start);
742 if (!STRINGP (paragraph_start_re))
743 paragraph_start_re = build_string ("\f\\|[ \t]*$");
744 staticpro (&paragraph_start_re);
745 Qparagraph_separate = intern ("paragraph-separate");
746 staticpro (&Qparagraph_separate);
747 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
748 if (!STRINGP (paragraph_separate_re))
749 paragraph_separate_re = build_string ("[ \t\f]*$");
750 staticpro (&paragraph_separate_re);
751
752 bidi_cache_sp = 0;
753
754 bidi_initialized = 1;
755 }
756
757 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
758 end. */
759 static inline void
760 bidi_set_paragraph_end (struct bidi_it *bidi_it)
761 {
762 bidi_it->invalid_levels = 0;
763 bidi_it->invalid_rl_levels = -1;
764 bidi_it->stack_idx = 0;
765 bidi_it->resolved_level = bidi_it->level_stack[0].level;
766 }
767
768 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
769 void
770 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
771 struct bidi_it *bidi_it)
772 {
773 if (! bidi_initialized)
774 bidi_initialize ();
775 if (charpos >= 0)
776 bidi_it->charpos = charpos;
777 if (bytepos >= 0)
778 bidi_it->bytepos = bytepos;
779 bidi_it->frame_window_p = frame_window_p;
780 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
781 bidi_it->first_elt = 1;
782 bidi_set_paragraph_end (bidi_it);
783 bidi_it->new_paragraph = 1;
784 bidi_it->separator_limit = -1;
785 bidi_it->type = NEUTRAL_B;
786 bidi_it->type_after_w1 = NEUTRAL_B;
787 bidi_it->orig_type = NEUTRAL_B;
788 bidi_it->prev_was_pdf = 0;
789 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
790 bidi_it->prev.orig_type = UNKNOWN_BT;
791 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
792 bidi_it->last_strong.orig_type = UNKNOWN_BT;
793 bidi_it->next_for_neutral.charpos = -1;
794 bidi_it->next_for_neutral.type =
795 bidi_it->next_for_neutral.type_after_w1 =
796 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
797 bidi_it->prev_for_neutral.charpos = -1;
798 bidi_it->prev_for_neutral.type =
799 bidi_it->prev_for_neutral.type_after_w1 =
800 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
801 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
802 bidi_it->disp_pos = -1; /* invalid/unknown */
803 /* We can only shrink the cache if we are at the bottom level of its
804 "stack". */
805 if (bidi_cache_start == 0)
806 bidi_cache_shrink ();
807 else
808 bidi_cache_reset ();
809 }
810
811 /* Perform initializations for reordering a new line of bidi text. */
812 static void
813 bidi_line_init (struct bidi_it *bidi_it)
814 {
815 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
816 bidi_it->resolved_level = bidi_it->level_stack[0].level;
817 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
818 bidi_it->invalid_levels = 0;
819 bidi_it->invalid_rl_levels = -1;
820 bidi_it->next_en_pos = -1;
821 bidi_it->next_for_ws.type = UNKNOWN_BT;
822 bidi_set_sor_type (bidi_it,
823 bidi_it->paragraph_dir == R2L ? 1 : 0,
824 bidi_it->level_stack[0].level); /* X10 */
825
826 bidi_cache_reset ();
827 }
828
829 \f
830 /***********************************************************************
831 Fetching characters
832 ***********************************************************************/
833
834 /* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
835 are zero-based character positions in S, BEGBYTE is byte position
836 corresponding to BEG. UNIBYTE, if non-zero, means S is a unibyte
837 string. */
838 static inline EMACS_INT
839 bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
840 const EMACS_INT begbyte, const EMACS_INT end, int unibyte)
841 {
842 EMACS_INT pos = beg;
843 const unsigned char *p = s + begbyte, *start = p;
844
845 if (unibyte)
846 p = s + end;
847 else
848 {
849 if (!CHAR_HEAD_P (*p))
850 abort ();
851
852 while (pos < end)
853 {
854 p += BYTES_BY_CHAR_HEAD (*p);
855 pos++;
856 }
857 }
858
859 return p - start;
860 }
861
862 /* Fetch and returns the character at byte position BYTEPOS. If S is
863 non-NULL, fetch the character from string S; otherwise fetch the
864 character from the current buffer. UNIBYTE non-zero means S is a
865 unibyte string. */
866 static inline int
867 bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s, int unibyte)
868 {
869 if (s)
870 {
871 if (unibyte)
872 return s[bytepos];
873 else
874 return STRING_CHAR (s + bytepos);
875 }
876 else
877 return FETCH_MULTIBYTE_CHAR (bytepos);
878 }
879
880 /* Fetch and return the character at BYTEPOS/CHARPOS. If that
881 character is covered by a display string, treat the entire run of
882 covered characters as a single character u+FFFC, and return their
883 combined length in CH_LEN and NCHARS. DISP_POS specifies the
884 character position of the next display string, or -1 if not yet
885 computed. When the next character is at or beyond that position,
886 the function updates DISP_POS with the position of the next display
887 string. STRING->s is the C string to iterate, or NULL if iterating
888 over a buffer or a Lisp string; in the latter case, STRING->lstring
889 is the Lisp string. */
890 static inline int
891 bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
892 struct bidi_string_data *string,
893 int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
894 {
895 int ch;
896 EMACS_INT endpos =
897 (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
898 struct text_pos pos;
899
900 /* If we got past the last known position of display string, compute
901 the position of the next one. That position could be at CHARPOS. */
902 if (charpos < endpos && charpos > *disp_pos)
903 {
904 SET_TEXT_POS (pos, charpos, bytepos);
905 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
906 }
907
908 /* Fetch the character at BYTEPOS. */
909 if (charpos >= endpos)
910 {
911 ch = BIDI_EOB;
912 *ch_len = 1;
913 *nchars = 1;
914 *disp_pos = endpos;
915 }
916 else if (charpos >= *disp_pos)
917 {
918 EMACS_INT disp_end_pos;
919
920 /* We don't expect to find ourselves in the middle of a display
921 property. Hopefully, it will never be needed. */
922 if (charpos > *disp_pos)
923 abort ();
924 /* Return the Unicode Object Replacement Character to represent
925 the entire run of characters covered by the display string. */
926 ch = 0xFFFC;
927 disp_end_pos = compute_display_string_end (*disp_pos, string);
928 *nchars = disp_end_pos - *disp_pos;
929 if (*nchars <= 0)
930 abort ();
931 if (string->s)
932 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
933 disp_end_pos, string->unibyte);
934 else if (STRINGP (string->lstring))
935 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
936 bytepos, disp_end_pos, string->unibyte);
937 else
938 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
939 }
940 else
941 {
942 if (string->s)
943 {
944 int len;
945
946 if (!string->unibyte)
947 {
948 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
949 *ch_len = len;
950 }
951 else
952 {
953 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
954 *ch_len = 1;
955 }
956 }
957 else if (STRINGP (string->lstring))
958 {
959 int len;
960
961 if (!string->unibyte)
962 {
963 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
964 len);
965 *ch_len = len;
966 }
967 else
968 {
969 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
970 *ch_len = 1;
971 }
972 }
973 else
974 {
975 ch = FETCH_MULTIBYTE_CHAR (bytepos);
976 *ch_len = CHAR_BYTES (ch);
977 }
978 *nchars = 1;
979 }
980
981 /* If we just entered a run of characters covered by a display
982 string, compute the position of the next display string. */
983 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos)
984 {
985 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
986 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
987 }
988
989 return ch;
990 }
991
992 \f
993 /***********************************************************************
994 Determining paragraph direction
995 ***********************************************************************/
996
997 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
998 Value is the non-negative length of the paragraph separator
999 following the buffer position, -1 if position is at the beginning
1000 of a new paragraph, or -2 if position is neither at beginning nor
1001 at end of a paragraph. */
1002 static EMACS_INT
1003 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
1004 {
1005 Lisp_Object sep_re;
1006 Lisp_Object start_re;
1007 EMACS_INT val;
1008
1009 sep_re = paragraph_separate_re;
1010 start_re = paragraph_start_re;
1011
1012 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1013 if (val < 0)
1014 {
1015 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1016 val = -1;
1017 else
1018 val = -2;
1019 }
1020
1021 return val;
1022 }
1023
1024 /* Find the beginning of this paragraph by looking back in the buffer.
1025 Value is the byte position of the paragraph's beginning. */
1026 static EMACS_INT
1027 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
1028 {
1029 Lisp_Object re = paragraph_start_re;
1030 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
1031
1032 while (pos_byte > BEGV_BYTE
1033 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1034 {
1035 /* FIXME: What if the paragraph beginning is covered by a
1036 display string? And what if a display string covering some
1037 of the text over which we scan back includes
1038 paragraph_start_re? */
1039 pos = find_next_newline_no_quit (pos - 1, -1);
1040 pos_byte = CHAR_TO_BYTE (pos);
1041 }
1042 return pos_byte;
1043 }
1044
1045 /* Determine the base direction, a.k.a. base embedding level, of the
1046 paragraph we are about to iterate through. If DIR is either L2R or
1047 R2L, just use that. Otherwise, determine the paragraph direction
1048 from the first strong directional character of the paragraph.
1049
1050 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
1051 has no strong directional characters and both DIR and
1052 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1053 in the buffer until a paragraph is found with a strong character,
1054 or until hitting BEGV. In the latter case, fall back to L2R. This
1055 flag is used in current-bidi-paragraph-direction.
1056
1057 Note that this function gives the paragraph separator the same
1058 direction as the preceding paragraph, even though Emacs generally
1059 views the separartor as not belonging to any paragraph. */
1060 void
1061 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
1062 {
1063 EMACS_INT bytepos = bidi_it->bytepos;
1064 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
1065 EMACS_INT pstartbyte;
1066 /* Note that begbyte is a byte position, while end is a character
1067 position. Yes, this is ugly, but we are trying to avoid costly
1068 calls to BYTE_TO_CHAR and its ilk. */
1069 EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
1070 EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
1071
1072 /* Special case for an empty buffer. */
1073 if (bytepos == begbyte && bidi_it->charpos == end)
1074 dir = L2R;
1075 /* We should never be called at EOB or before BEGV. */
1076 else if (bidi_it->charpos >= end || bytepos < begbyte)
1077 abort ();
1078
1079 if (dir == L2R)
1080 {
1081 bidi_it->paragraph_dir = L2R;
1082 bidi_it->new_paragraph = 0;
1083 }
1084 else if (dir == R2L)
1085 {
1086 bidi_it->paragraph_dir = R2L;
1087 bidi_it->new_paragraph = 0;
1088 }
1089 else if (dir == NEUTRAL_DIR) /* P2 */
1090 {
1091 int ch;
1092 EMACS_INT ch_len, nchars;
1093 EMACS_INT pos, disp_pos = -1;
1094 bidi_type_t type;
1095 const unsigned char *s;
1096
1097 if (!bidi_initialized)
1098 bidi_initialize ();
1099
1100 /* If we are inside a paragraph separator, we are just waiting
1101 for the separator to be exhausted; use the previous paragraph
1102 direction. But don't do that if we have been just reseated,
1103 because we need to reinitialize below in that case. */
1104 if (!bidi_it->first_elt
1105 && bidi_it->charpos < bidi_it->separator_limit)
1106 return;
1107
1108 /* If we are on a newline, get past it to where the next
1109 paragraph might start. But don't do that at BEGV since then
1110 we are potentially in a new paragraph that doesn't yet
1111 exist. */
1112 pos = bidi_it->charpos;
1113 s = STRINGP (bidi_it->string.lstring) ?
1114 SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1115 if (bytepos > begbyte
1116 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1117 {
1118 bytepos++;
1119 pos++;
1120 }
1121
1122 /* We are either at the beginning of a paragraph or in the
1123 middle of it. Find where this paragraph starts. */
1124 if (string_p)
1125 {
1126 /* We don't support changes of paragraph direction inside a
1127 string. It is treated as a single paragraph. */
1128 pstartbyte = 0;
1129 }
1130 else
1131 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1132 bidi_it->separator_limit = -1;
1133 bidi_it->new_paragraph = 0;
1134
1135 /* The following loop is run more than once only if NO_DEFAULT_P
1136 is non-zero, and only if we are iterating on a buffer. */
1137 do {
1138 bytepos = pstartbyte;
1139 if (!string_p)
1140 pos = BYTE_TO_CHAR (bytepos);
1141 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
1142 bidi_it->frame_window_p, &ch_len, &nchars);
1143 type = bidi_get_type (ch, NEUTRAL_DIR);
1144
1145 for (pos += nchars, bytepos += ch_len;
1146 /* NOTE: UAX#9 says to search only for L, AL, or R types
1147 of characters, and ignore RLE, RLO, LRE, and LRO.
1148 However, I'm not sure it makes sense to omit those 4;
1149 should try with and without that to see the effect. */
1150 (bidi_get_category (type) != STRONG)
1151 || (bidi_ignore_explicit_marks_for_paragraph_level
1152 && (type == RLE || type == RLO
1153 || type == LRE || type == LRO));
1154 type = bidi_get_type (ch, NEUTRAL_DIR))
1155 {
1156 if (pos >= end)
1157 {
1158 /* Pretend there's a paragraph separator at end of
1159 buffer/string. */
1160 type = NEUTRAL_B;
1161 break;
1162 }
1163 if (!string_p
1164 && type == NEUTRAL_B
1165 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1166 break;
1167 /* Fetch next character and advance to get past it. */
1168 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
1169 bidi_it->frame_window_p, &ch_len, &nchars);
1170 pos += nchars;
1171 bytepos += ch_len;
1172 }
1173 if (type == STRONG_R || type == STRONG_AL) /* P3 */
1174 bidi_it->paragraph_dir = R2L;
1175 else if (type == STRONG_L)
1176 bidi_it->paragraph_dir = L2R;
1177 if (!string_p
1178 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1179 {
1180 /* If this paragraph is at BEGV, default to L2R. */
1181 if (pstartbyte == BEGV_BYTE)
1182 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1183 else
1184 {
1185 EMACS_INT prevpbyte = pstartbyte;
1186 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1187
1188 /* Find the beginning of the previous paragraph, if any. */
1189 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1190 {
1191 /* FXIME: What if p is covered by a display
1192 string? See also a FIXME inside
1193 bidi_find_paragraph_start. */
1194 p--;
1195 pbyte = CHAR_TO_BYTE (p);
1196 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1197 }
1198 pstartbyte = prevpbyte;
1199 }
1200 }
1201 } while (!string_p
1202 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1203 }
1204 else
1205 abort ();
1206
1207 /* Contrary to UAX#9 clause P3, we only default the paragraph
1208 direction to L2R if we have no previous usable paragraph
1209 direction. This is allowed by the HL1 clause. */
1210 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1211 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1212 if (bidi_it->paragraph_dir == R2L)
1213 bidi_it->level_stack[0].level = 1;
1214 else
1215 bidi_it->level_stack[0].level = 0;
1216
1217 bidi_line_init (bidi_it);
1218 }
1219
1220 \f
1221 /***********************************************************************
1222 Resolving explicit and implicit levels.
1223 The rest of this file constitutes the core of the UBA implementation.
1224 ***********************************************************************/
1225
1226 static inline int
1227 bidi_explicit_dir_char (int ch)
1228 {
1229 bidi_type_t ch_type;
1230
1231 if (!bidi_initialized)
1232 abort ();
1233 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1234 return (ch_type == LRE || ch_type == LRO
1235 || ch_type == RLE || ch_type == RLO
1236 || ch_type == PDF);
1237 }
1238
1239 /* A helper function for bidi_resolve_explicit. It advances to the
1240 next character in logical order and determines the new embedding
1241 level and directional override, but does not take into account
1242 empty embeddings. */
1243 static int
1244 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1245 {
1246 int curchar;
1247 bidi_type_t type;
1248 int current_level;
1249 int new_level;
1250 bidi_dir_t override;
1251 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
1252
1253 /* If reseat()'ed, don't advance, so as to start iteration from the
1254 position where we were reseated. bidi_it->bytepos can be less
1255 than BEGV_BYTE after reseat to BEGV. */
1256 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1257 || bidi_it->first_elt)
1258 {
1259 bidi_it->first_elt = 0;
1260 if (string_p)
1261 {
1262 const unsigned char *p =
1263 STRINGP (bidi_it->string.lstring)
1264 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1265
1266 if (bidi_it->charpos < 0)
1267 bidi_it->charpos = 0;
1268 bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos,
1269 bidi_it->string.unibyte);
1270 }
1271 else
1272 {
1273 if (bidi_it->charpos < BEGV)
1274 bidi_it->charpos = BEGV;
1275 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1276 }
1277 }
1278 /* Don't move at end of buffer/string. */
1279 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1280 {
1281 /* Advance to the next character, skipping characters covered by
1282 display strings (nchars > 1). */
1283 if (bidi_it->nchars <= 0)
1284 abort ();
1285 bidi_it->charpos += bidi_it->nchars;
1286 if (bidi_it->ch_len == 0)
1287 abort ();
1288 bidi_it->bytepos += bidi_it->ch_len;
1289 }
1290
1291 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1292 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1293 new_level = current_level;
1294
1295 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1296 {
1297 curchar = BIDI_EOB;
1298 bidi_it->ch_len = 1;
1299 bidi_it->nchars = 1;
1300 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1301 }
1302 else
1303 {
1304 /* Fetch the character at BYTEPOS. If it is covered by a
1305 display string, treat the entire run of covered characters as
1306 a single character u+FFFC. */
1307 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
1308 &bidi_it->disp_pos, &bidi_it->string,
1309 bidi_it->frame_window_p,
1310 &bidi_it->ch_len, &bidi_it->nchars);
1311 }
1312 bidi_it->ch = curchar;
1313
1314 /* Don't apply directional override here, as all the types we handle
1315 below will not be affected by the override anyway, and we need
1316 the original type unaltered. The override will be applied in
1317 bidi_resolve_weak. */
1318 type = bidi_get_type (curchar, NEUTRAL_DIR);
1319 bidi_it->orig_type = type;
1320 bidi_check_type (bidi_it->orig_type);
1321
1322 if (type != PDF)
1323 bidi_it->prev_was_pdf = 0;
1324
1325 bidi_it->type_after_w1 = UNKNOWN_BT;
1326
1327 switch (type)
1328 {
1329 case RLE: /* X2 */
1330 case RLO: /* X4 */
1331 bidi_it->type_after_w1 = type;
1332 bidi_check_type (bidi_it->type_after_w1);
1333 type = WEAK_BN; /* X9/Retaining */
1334 if (bidi_it->ignore_bn_limit <= -1)
1335 {
1336 if (current_level <= BIDI_MAXLEVEL - 4)
1337 {
1338 /* Compute the least odd embedding level greater than
1339 the current level. */
1340 new_level = ((current_level + 1) & ~1) + 1;
1341 if (bidi_it->type_after_w1 == RLE)
1342 override = NEUTRAL_DIR;
1343 else
1344 override = R2L;
1345 if (current_level == BIDI_MAXLEVEL - 4)
1346 bidi_it->invalid_rl_levels = 0;
1347 bidi_push_embedding_level (bidi_it, new_level, override);
1348 }
1349 else
1350 {
1351 bidi_it->invalid_levels++;
1352 /* See the commentary about invalid_rl_levels below. */
1353 if (bidi_it->invalid_rl_levels < 0)
1354 bidi_it->invalid_rl_levels = 0;
1355 bidi_it->invalid_rl_levels++;
1356 }
1357 }
1358 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1359 || bidi_it->next_en_pos > bidi_it->charpos)
1360 type = WEAK_EN;
1361 break;
1362 case LRE: /* X3 */
1363 case LRO: /* X5 */
1364 bidi_it->type_after_w1 = type;
1365 bidi_check_type (bidi_it->type_after_w1);
1366 type = WEAK_BN; /* X9/Retaining */
1367 if (bidi_it->ignore_bn_limit <= -1)
1368 {
1369 if (current_level <= BIDI_MAXLEVEL - 5)
1370 {
1371 /* Compute the least even embedding level greater than
1372 the current level. */
1373 new_level = ((current_level + 2) & ~1);
1374 if (bidi_it->type_after_w1 == LRE)
1375 override = NEUTRAL_DIR;
1376 else
1377 override = L2R;
1378 bidi_push_embedding_level (bidi_it, new_level, override);
1379 }
1380 else
1381 {
1382 bidi_it->invalid_levels++;
1383 /* invalid_rl_levels counts invalid levels encountered
1384 while the embedding level was already too high for
1385 LRE/LRO, but not for RLE/RLO. That is because
1386 there may be exactly one PDF which we should not
1387 ignore even though invalid_levels is non-zero.
1388 invalid_rl_levels helps to know what PDF is
1389 that. */
1390 if (bidi_it->invalid_rl_levels >= 0)
1391 bidi_it->invalid_rl_levels++;
1392 }
1393 }
1394 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1395 || bidi_it->next_en_pos > bidi_it->charpos)
1396 type = WEAK_EN;
1397 break;
1398 case PDF: /* X7 */
1399 bidi_it->type_after_w1 = type;
1400 bidi_check_type (bidi_it->type_after_w1);
1401 type = WEAK_BN; /* X9/Retaining */
1402 if (bidi_it->ignore_bn_limit <= -1)
1403 {
1404 if (!bidi_it->invalid_rl_levels)
1405 {
1406 new_level = bidi_pop_embedding_level (bidi_it);
1407 bidi_it->invalid_rl_levels = -1;
1408 if (bidi_it->invalid_levels)
1409 bidi_it->invalid_levels--;
1410 /* else nothing: UAX#9 says to ignore invalid PDFs */
1411 }
1412 if (!bidi_it->invalid_levels)
1413 new_level = bidi_pop_embedding_level (bidi_it);
1414 else
1415 {
1416 bidi_it->invalid_levels--;
1417 bidi_it->invalid_rl_levels--;
1418 }
1419 }
1420 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1421 || bidi_it->next_en_pos > bidi_it->charpos)
1422 type = WEAK_EN;
1423 break;
1424 default:
1425 /* Nothing. */
1426 break;
1427 }
1428
1429 bidi_it->type = type;
1430 bidi_check_type (bidi_it->type);
1431
1432 return new_level;
1433 }
1434
1435 /* Given an iterator state in BIDI_IT, advance one character position
1436 in the buffer/string to the next character (in the logical order),
1437 resolve any explicit embeddings and directional overrides, and
1438 return the embedding level of the character after resolving
1439 explicit directives and ignoring empty embeddings. */
1440 static int
1441 bidi_resolve_explicit (struct bidi_it *bidi_it)
1442 {
1443 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1444 int new_level = bidi_resolve_explicit_1 (bidi_it);
1445 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
1446 const unsigned char *s = STRINGP (bidi_it->string.lstring)
1447 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1448
1449 if (prev_level < new_level
1450 && bidi_it->type == WEAK_BN
1451 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1452 && bidi_it->charpos < eob /* not already at EOB */
1453 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1454 + bidi_it->ch_len, s,
1455 bidi_it->string.unibyte)))
1456 {
1457 /* Avoid pushing and popping embedding levels if the level run
1458 is empty, as this breaks level runs where it shouldn't.
1459 UAX#9 removes all the explicit embedding and override codes,
1460 so empty embeddings disappear without a trace. We need to
1461 behave as if we did the same. */
1462 struct bidi_it saved_it;
1463 int level = prev_level;
1464
1465 bidi_copy_it (&saved_it, bidi_it);
1466
1467 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1468 + bidi_it->ch_len, s,
1469 bidi_it->string.unibyte)))
1470 {
1471 /* This advances to the next character, skipping any
1472 characters covered by display strings. */
1473 level = bidi_resolve_explicit_1 (bidi_it);
1474 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1475 a pointer to its data is no longer valid. */
1476 if (STRINGP (bidi_it->string.lstring))
1477 s = SDATA (bidi_it->string.lstring);
1478 }
1479
1480 if (bidi_it->nchars <= 0)
1481 abort ();
1482 if (level == prev_level) /* empty embedding */
1483 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
1484 else /* this embedding is non-empty */
1485 saved_it.ignore_bn_limit = -2;
1486
1487 bidi_copy_it (bidi_it, &saved_it);
1488 if (bidi_it->ignore_bn_limit > -1)
1489 {
1490 /* We pushed a level, but we shouldn't have. Undo that. */
1491 if (!bidi_it->invalid_rl_levels)
1492 {
1493 new_level = bidi_pop_embedding_level (bidi_it);
1494 bidi_it->invalid_rl_levels = -1;
1495 if (bidi_it->invalid_levels)
1496 bidi_it->invalid_levels--;
1497 }
1498 if (!bidi_it->invalid_levels)
1499 new_level = bidi_pop_embedding_level (bidi_it);
1500 else
1501 {
1502 bidi_it->invalid_levels--;
1503 bidi_it->invalid_rl_levels--;
1504 }
1505 }
1506 }
1507
1508 if (bidi_it->type == NEUTRAL_B) /* X8 */
1509 {
1510 bidi_set_paragraph_end (bidi_it);
1511 /* This is needed by bidi_resolve_weak below, and in L1. */
1512 bidi_it->type_after_w1 = bidi_it->type;
1513 bidi_check_type (bidi_it->type_after_w1);
1514 }
1515
1516 return new_level;
1517 }
1518
1519 /* Advance in the buffer/string, resolve weak types and return the
1520 type of the next character after weak type resolution. */
1521 static bidi_type_t
1522 bidi_resolve_weak (struct bidi_it *bidi_it)
1523 {
1524 bidi_type_t type;
1525 bidi_dir_t override;
1526 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1527 int new_level = bidi_resolve_explicit (bidi_it);
1528 int next_char;
1529 bidi_type_t type_of_next;
1530 struct bidi_it saved_it;
1531 EMACS_INT eob =
1532 (STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1533 ? bidi_it->string.schars : ZV;
1534
1535 type = bidi_it->type;
1536 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1537
1538 if (type == UNKNOWN_BT
1539 || type == LRE
1540 || type == LRO
1541 || type == RLE
1542 || type == RLO
1543 || type == PDF)
1544 abort ();
1545
1546 if (new_level != prev_level
1547 || bidi_it->type == NEUTRAL_B)
1548 {
1549 /* We've got a new embedding level run, compute the directional
1550 type of sor and initialize per-run variables (UAX#9, clause
1551 X10). */
1552 bidi_set_sor_type (bidi_it, prev_level, new_level);
1553 }
1554 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1555 || type == WEAK_BN || type == STRONG_AL)
1556 bidi_it->type_after_w1 = type; /* needed in L1 */
1557 bidi_check_type (bidi_it->type_after_w1);
1558
1559 /* Level and directional override status are already recorded in
1560 bidi_it, and do not need any change; see X6. */
1561 if (override == R2L) /* X6 */
1562 type = STRONG_R;
1563 else if (override == L2R)
1564 type = STRONG_L;
1565 else
1566 {
1567 if (type == WEAK_NSM) /* W1 */
1568 {
1569 /* Note that we don't need to consider the case where the
1570 prev character has its type overridden by an RLO or LRO,
1571 because then either the type of this NSM would have been
1572 also overridden, or the previous character is outside the
1573 current level run, and thus not relevant to this NSM.
1574 This is why NSM gets the type_after_w1 of the previous
1575 character. */
1576 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1577 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1578 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1579 type = bidi_it->prev.type_after_w1;
1580 else if (bidi_it->sor == R2L)
1581 type = STRONG_R;
1582 else if (bidi_it->sor == L2R)
1583 type = STRONG_L;
1584 else /* shouldn't happen! */
1585 abort ();
1586 }
1587 if (type == WEAK_EN /* W2 */
1588 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1589 type = WEAK_AN;
1590 else if (type == STRONG_AL) /* W3 */
1591 type = STRONG_R;
1592 else if ((type == WEAK_ES /* W4 */
1593 && bidi_it->prev.type_after_w1 == WEAK_EN
1594 && bidi_it->prev.orig_type == WEAK_EN)
1595 || (type == WEAK_CS
1596 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1597 && bidi_it->prev.orig_type == WEAK_EN)
1598 || bidi_it->prev.type_after_w1 == WEAK_AN)))
1599 {
1600 const unsigned char *s =
1601 STRINGP (bidi_it->string.lstring)
1602 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1603
1604 next_char =
1605 bidi_it->charpos + bidi_it->nchars >= eob
1606 ? BIDI_EOB
1607 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1608 bidi_it->string.unibyte);
1609 type_of_next = bidi_get_type (next_char, override);
1610
1611 if (type_of_next == WEAK_BN
1612 || bidi_explicit_dir_char (next_char))
1613 {
1614 bidi_copy_it (&saved_it, bidi_it);
1615 while (bidi_resolve_explicit (bidi_it) == new_level
1616 && bidi_it->type == WEAK_BN)
1617 ;
1618 type_of_next = bidi_it->type;
1619 bidi_copy_it (bidi_it, &saved_it);
1620 }
1621
1622 /* If the next character is EN, but the last strong-type
1623 character is AL, that next EN will be changed to AN when
1624 we process it in W2 above. So in that case, this ES
1625 should not be changed into EN. */
1626 if (type == WEAK_ES
1627 && type_of_next == WEAK_EN
1628 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1629 type = WEAK_EN;
1630 else if (type == WEAK_CS)
1631 {
1632 if (bidi_it->prev.type_after_w1 == WEAK_AN
1633 && (type_of_next == WEAK_AN
1634 /* If the next character is EN, but the last
1635 strong-type character is AL, EN will be later
1636 changed to AN when we process it in W2 above.
1637 So in that case, this ES should not be
1638 changed into EN. */
1639 || (type_of_next == WEAK_EN
1640 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1641 type = WEAK_AN;
1642 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1643 && type_of_next == WEAK_EN
1644 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1645 type = WEAK_EN;
1646 }
1647 }
1648 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1649 || type == WEAK_BN) /* W5/Retaining */
1650 {
1651 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1652 || bidi_it->next_en_pos > bidi_it->charpos)
1653 type = WEAK_EN;
1654 else /* W5: ET/BN with EN after it. */
1655 {
1656 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
1657 const unsigned char *s =
1658 STRINGP (bidi_it->string.lstring)
1659 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1660
1661 if (bidi_it->nchars <= 0)
1662 abort ();
1663 next_char =
1664 bidi_it->charpos + bidi_it->nchars >= eob
1665 ? BIDI_EOB
1666 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1667 bidi_it->string.unibyte);
1668 type_of_next = bidi_get_type (next_char, override);
1669
1670 if (type_of_next == WEAK_ET
1671 || type_of_next == WEAK_BN
1672 || bidi_explicit_dir_char (next_char))
1673 {
1674 bidi_copy_it (&saved_it, bidi_it);
1675 while (bidi_resolve_explicit (bidi_it) == new_level
1676 && (bidi_it->type == WEAK_BN
1677 || bidi_it->type == WEAK_ET))
1678 ;
1679 type_of_next = bidi_it->type;
1680 en_pos = bidi_it->charpos;
1681 bidi_copy_it (bidi_it, &saved_it);
1682 }
1683 if (type_of_next == WEAK_EN)
1684 {
1685 /* If the last strong character is AL, the EN we've
1686 found will become AN when we get to it (W2). */
1687 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1688 {
1689 type = WEAK_EN;
1690 /* Remember this EN position, to speed up processing
1691 of the next ETs. */
1692 bidi_it->next_en_pos = en_pos;
1693 }
1694 else if (type == WEAK_BN)
1695 type = NEUTRAL_ON; /* W6/Retaining */
1696 }
1697 }
1698 }
1699 }
1700
1701 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1702 || (type == WEAK_BN
1703 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1704 || bidi_it->prev.type_after_w1 == WEAK_ES
1705 || bidi_it->prev.type_after_w1 == WEAK_ET)))
1706 type = NEUTRAL_ON;
1707
1708 /* Store the type we've got so far, before we clobber it with strong
1709 types in W7 and while resolving neutral types. But leave alone
1710 the original types that were recorded above, because we will need
1711 them for the L1 clause. */
1712 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1713 bidi_it->type_after_w1 = type;
1714 bidi_check_type (bidi_it->type_after_w1);
1715
1716 if (type == WEAK_EN) /* W7 */
1717 {
1718 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1719 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1720 type = STRONG_L;
1721 }
1722
1723 bidi_it->type = type;
1724 bidi_check_type (bidi_it->type);
1725 return type;
1726 }
1727
1728 /* Resolve the type of a neutral character according to the type of
1729 surrounding strong text and the current embedding level. */
1730 static inline bidi_type_t
1731 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1732 {
1733 /* N1: European and Arabic numbers are treated as though they were R. */
1734 if (next_type == WEAK_EN || next_type == WEAK_AN)
1735 next_type = STRONG_R;
1736 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1737 prev_type = STRONG_R;
1738
1739 if (next_type == prev_type) /* N1 */
1740 return next_type;
1741 else if ((lev & 1) == 0) /* N2 */
1742 return STRONG_L;
1743 else
1744 return STRONG_R;
1745 }
1746
1747 static bidi_type_t
1748 bidi_resolve_neutral (struct bidi_it *bidi_it)
1749 {
1750 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1751 bidi_type_t type = bidi_resolve_weak (bidi_it);
1752 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1753
1754 if (!(type == STRONG_R
1755 || type == STRONG_L
1756 || type == WEAK_BN
1757 || type == WEAK_EN
1758 || type == WEAK_AN
1759 || type == NEUTRAL_B
1760 || type == NEUTRAL_S
1761 || type == NEUTRAL_WS
1762 || type == NEUTRAL_ON))
1763 abort ();
1764
1765 if (bidi_get_category (type) == NEUTRAL
1766 || (type == WEAK_BN && prev_level == current_level))
1767 {
1768 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1769 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1770 bidi_it->next_for_neutral.type,
1771 current_level);
1772 else
1773 {
1774 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1775 the assumption of batch-style processing; see clauses W4,
1776 W5, and especially N1, which require to look far forward
1777 (as well as back) in the buffer/string. May the fleas of
1778 a thousand camels infest the armpits of those who design
1779 supposedly general-purpose algorithms by looking at their
1780 own implementations, and fail to consider other possible
1781 implementations! */
1782 struct bidi_it saved_it;
1783 bidi_type_t next_type;
1784
1785 if (bidi_it->scan_dir == -1)
1786 abort ();
1787
1788 bidi_copy_it (&saved_it, bidi_it);
1789 /* Scan the text forward until we find the first non-neutral
1790 character, and then use that to resolve the neutral we
1791 are dealing with now. We also cache the scanned iterator
1792 states, to salvage some of the effort later. */
1793 bidi_cache_iterator_state (bidi_it, 0);
1794 do {
1795 /* Record the info about the previous character, so that
1796 it will be cached below with this state. */
1797 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1798 && bidi_it->type != WEAK_BN)
1799 bidi_remember_char (&bidi_it->prev, bidi_it);
1800 type = bidi_resolve_weak (bidi_it);
1801 /* Paragraph separators have their levels fully resolved
1802 at this point, so cache them as resolved. */
1803 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1804 /* FIXME: implement L1 here, by testing for a newline and
1805 resetting the level for any sequence of whitespace
1806 characters adjacent to it. */
1807 } while (!(type == NEUTRAL_B
1808 || (type != WEAK_BN
1809 && bidi_get_category (type) != NEUTRAL)
1810 /* This is all per level run, so stop when we
1811 reach the end of this level run. */
1812 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1813 current_level));
1814
1815 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1816
1817 switch (type)
1818 {
1819 case STRONG_L:
1820 case STRONG_R:
1821 case STRONG_AL:
1822 next_type = type;
1823 break;
1824 case WEAK_EN:
1825 case WEAK_AN:
1826 /* N1: ``European and Arabic numbers are treated as
1827 though they were R.'' */
1828 next_type = STRONG_R;
1829 saved_it.next_for_neutral.type = STRONG_R;
1830 break;
1831 case WEAK_BN:
1832 if (!bidi_explicit_dir_char (bidi_it->ch))
1833 abort (); /* can't happen: BNs are skipped */
1834 /* FALLTHROUGH */
1835 case NEUTRAL_B:
1836 /* Marched all the way to the end of this level run.
1837 We need to use the eor type, whose information is
1838 stored by bidi_set_sor_type in the prev_for_neutral
1839 member. */
1840 if (saved_it.type != WEAK_BN
1841 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1842 {
1843 next_type = bidi_it->prev_for_neutral.type;
1844 saved_it.next_for_neutral.type = next_type;
1845 bidi_check_type (next_type);
1846 }
1847 else
1848 {
1849 /* This is a BN which does not adjoin neutrals.
1850 Leave its type alone. */
1851 bidi_copy_it (bidi_it, &saved_it);
1852 return bidi_it->type;
1853 }
1854 break;
1855 default:
1856 abort ();
1857 }
1858 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1859 next_type, current_level);
1860 saved_it.type = type;
1861 bidi_check_type (type);
1862 bidi_copy_it (bidi_it, &saved_it);
1863 }
1864 }
1865 return type;
1866 }
1867
1868 /* Given an iterator state in BIDI_IT, advance one character position
1869 in the buffer/string to the next character (in the logical order),
1870 resolve the bidi type of that next character, and return that
1871 type. */
1872 static bidi_type_t
1873 bidi_type_of_next_char (struct bidi_it *bidi_it)
1874 {
1875 bidi_type_t type;
1876
1877 /* This should always be called during a forward scan. */
1878 if (bidi_it->scan_dir != 1)
1879 abort ();
1880
1881 /* Reset the limit until which to ignore BNs if we step out of the
1882 area where we found only empty levels. */
1883 if ((bidi_it->ignore_bn_limit > -1
1884 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1885 || (bidi_it->ignore_bn_limit == -2
1886 && !bidi_explicit_dir_char (bidi_it->ch)))
1887 bidi_it->ignore_bn_limit = -1;
1888
1889 type = bidi_resolve_neutral (bidi_it);
1890
1891 return type;
1892 }
1893
1894 /* Given an iterator state BIDI_IT, advance one character position in
1895 the buffer/string to the next character (in the current scan
1896 direction), resolve the embedding and implicit levels of that next
1897 character, and return the resulting level. */
1898 static int
1899 bidi_level_of_next_char (struct bidi_it *bidi_it)
1900 {
1901 bidi_type_t type;
1902 int level, prev_level = -1;
1903 struct bidi_saved_info next_for_neutral;
1904 EMACS_INT next_char_pos = -2;
1905
1906 if (bidi_it->scan_dir == 1)
1907 {
1908 EMACS_INT eob =
1909 (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1910 ? bidi_it->string.schars : ZV;
1911
1912 /* There's no sense in trying to advance if we hit end of text. */
1913 if (bidi_it->charpos >= eob)
1914 return bidi_it->resolved_level;
1915
1916 /* Record the info about the previous character. */
1917 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1918 && bidi_it->type != WEAK_BN)
1919 bidi_remember_char (&bidi_it->prev, bidi_it);
1920 if (bidi_it->type_after_w1 == STRONG_R
1921 || bidi_it->type_after_w1 == STRONG_L
1922 || bidi_it->type_after_w1 == STRONG_AL)
1923 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1924 /* FIXME: it sounds like we don't need both prev and
1925 prev_for_neutral members, but I'm leaving them both for now. */
1926 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1927 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1928 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1929
1930 /* If we overstepped the characters used for resolving neutrals
1931 and whitespace, invalidate their info in the iterator. */
1932 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1933 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1934 if (bidi_it->next_en_pos >= 0
1935 && bidi_it->charpos >= bidi_it->next_en_pos)
1936 bidi_it->next_en_pos = -1;
1937 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1938 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1939 bidi_it->next_for_ws.type = UNKNOWN_BT;
1940
1941 /* This must be taken before we fill the iterator with the info
1942 about the next char. If we scan backwards, the iterator
1943 state must be already cached, so there's no need to know the
1944 embedding level of the previous character, since we will be
1945 returning to our caller shortly. */
1946 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1947 }
1948 next_for_neutral = bidi_it->next_for_neutral;
1949
1950 /* Perhaps the character we want is already cached. If it is, the
1951 call to bidi_cache_find below will return a type other than
1952 UNKNOWN_BT. */
1953 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
1954 {
1955 int bob =
1956 (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) ? 0 : 1;
1957
1958 if (bidi_it->scan_dir > 0)
1959 {
1960 if (bidi_it->nchars <= 0)
1961 abort ();
1962 next_char_pos = bidi_it->charpos + bidi_it->nchars;
1963 }
1964 else if (bidi_it->charpos >= bob)
1965 /* Implementation note: we allow next_char_pos to be as low as
1966 0 for buffers or -1 for strings, and that is okay because
1967 that's the "position" of the sentinel iterator state we
1968 cached at the beginning of the iteration. */
1969 next_char_pos = bidi_it->charpos - 1;
1970 if (next_char_pos >= bob - 1)
1971 type = bidi_cache_find (next_char_pos, -1, bidi_it);
1972 else
1973 type = UNKNOWN_BT;
1974 }
1975 else
1976 type = UNKNOWN_BT;
1977 if (type != UNKNOWN_BT)
1978 {
1979 /* Don't lose the information for resolving neutrals! The
1980 cached states could have been cached before their
1981 next_for_neutral member was computed. If we are on our way
1982 forward, we can simply take the info from the previous
1983 state. */
1984 if (bidi_it->scan_dir == 1
1985 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1986 bidi_it->next_for_neutral = next_for_neutral;
1987
1988 /* If resolved_level is -1, it means this state was cached
1989 before it was completely resolved, so we cannot return
1990 it. */
1991 if (bidi_it->resolved_level != -1)
1992 return bidi_it->resolved_level;
1993 }
1994 if (bidi_it->scan_dir == -1)
1995 /* If we are going backwards, the iterator state is already cached
1996 from previous scans, and should be fully resolved. */
1997 abort ();
1998
1999 if (type == UNKNOWN_BT)
2000 type = bidi_type_of_next_char (bidi_it);
2001
2002 if (type == NEUTRAL_B)
2003 return bidi_it->resolved_level;
2004
2005 level = bidi_it->level_stack[bidi_it->stack_idx].level;
2006 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2007 || (type == WEAK_BN && prev_level == level))
2008 {
2009 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
2010 abort ();
2011
2012 /* If the cached state shows a neutral character, it was not
2013 resolved by bidi_resolve_neutral, so do it now. */
2014 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2015 bidi_it->next_for_neutral.type,
2016 level);
2017 }
2018
2019 if (!(type == STRONG_R
2020 || type == STRONG_L
2021 || type == WEAK_BN
2022 || type == WEAK_EN
2023 || type == WEAK_AN))
2024 abort ();
2025 bidi_it->type = type;
2026 bidi_check_type (bidi_it->type);
2027
2028 /* For L1 below, we need to know, for each WS character, whether
2029 it belongs to a sequence of WS characters preceding a newline
2030 or a TAB or a paragraph separator. */
2031 if (bidi_it->orig_type == NEUTRAL_WS
2032 && bidi_it->next_for_ws.type == UNKNOWN_BT)
2033 {
2034 int ch;
2035 EMACS_INT clen = bidi_it->ch_len;
2036 EMACS_INT bpos = bidi_it->bytepos;
2037 EMACS_INT cpos = bidi_it->charpos;
2038 EMACS_INT disp_pos = bidi_it->disp_pos;
2039 EMACS_INT nc = bidi_it->nchars;
2040 struct bidi_string_data bs = bidi_it->string;
2041 bidi_type_t chtype;
2042 int fwp = bidi_it->frame_window_p;
2043
2044 if (bidi_it->nchars <= 0)
2045 abort ();
2046 do {
2047 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &bs, fwp,
2048 &clen, &nc);
2049 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
2050 chtype = NEUTRAL_B;
2051 else
2052 chtype = bidi_get_type (ch, NEUTRAL_DIR);
2053 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2054 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2055 bidi_it->next_for_ws.type = chtype;
2056 bidi_check_type (bidi_it->next_for_ws.type);
2057 bidi_it->next_for_ws.charpos = cpos;
2058 bidi_it->next_for_ws.bytepos = bpos;
2059 }
2060
2061 /* Resolve implicit levels, with a twist: PDFs get the embedding
2062 level of the enbedding they terminate. See below for the
2063 reason. */
2064 if (bidi_it->orig_type == PDF
2065 /* Don't do this if this formatting code didn't change the
2066 embedding level due to invalid or empty embeddings. */
2067 && prev_level != level)
2068 {
2069 /* Don't look in UAX#9 for the reason for this: it's our own
2070 private quirk. The reason is that we want the formatting
2071 codes to be delivered so that they bracket the text of their
2072 embedding. For example, given the text
2073
2074 {RLO}teST{PDF}
2075
2076 we want it to be displayed as
2077
2078 {PDF}STet{RLO}
2079
2080 not as
2081
2082 STet{RLO}{PDF}
2083
2084 which will result because we bump up the embedding level as
2085 soon as we see the RLO and pop it as soon as we see the PDF,
2086 so RLO itself has the same embedding level as "teST", and
2087 thus would be normally delivered last, just before the PDF.
2088 The switch below fiddles with the level of PDF so that this
2089 ugly side effect does not happen.
2090
2091 (This is, of course, only important if the formatting codes
2092 are actually displayed, but Emacs does need to display them
2093 if the user wants to.) */
2094 level = prev_level;
2095 }
2096 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2097 || bidi_it->orig_type == NEUTRAL_S
2098 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
2099 /* || bidi_it->ch == LINESEP_CHAR */
2100 || (bidi_it->orig_type == NEUTRAL_WS
2101 && (bidi_it->next_for_ws.type == NEUTRAL_B
2102 || bidi_it->next_for_ws.type == NEUTRAL_S)))
2103 level = bidi_it->level_stack[0].level;
2104 else if ((level & 1) == 0) /* I1 */
2105 {
2106 if (type == STRONG_R)
2107 level++;
2108 else if (type == WEAK_EN || type == WEAK_AN)
2109 level += 2;
2110 }
2111 else /* I2 */
2112 {
2113 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2114 level++;
2115 }
2116
2117 bidi_it->resolved_level = level;
2118 return level;
2119 }
2120
2121 /* Move to the other edge of a level given by LEVEL. If END_FLAG is
2122 non-zero, we are at the end of a level, and we need to prepare to
2123 resume the scan of the lower level.
2124
2125 If this level's other edge is cached, we simply jump to it, filling
2126 the iterator structure with the iterator state on the other edge.
2127 Otherwise, we walk the buffer or string until we come back to the
2128 same level as LEVEL.
2129
2130 Note: we are not talking here about a ``level run'' in the UAX#9
2131 sense of the term, but rather about a ``level'' which includes
2132 all the levels higher than it. In other words, given the levels
2133 like this:
2134
2135 11111112222222333333334443343222222111111112223322111
2136 A B C
2137
2138 and assuming we are at point A scanning left to right, this
2139 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2140 at point B. */
2141 static void
2142 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
2143 {
2144 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
2145 ptrdiff_t idx;
2146
2147 /* Try the cache first. */
2148 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2149 >= bidi_cache_start)
2150 bidi_cache_fetch_state (idx, bidi_it);
2151 else
2152 {
2153 int new_level;
2154
2155 if (end_flag)
2156 abort (); /* if we are at end of level, its edges must be cached */
2157
2158 bidi_cache_iterator_state (bidi_it, 1);
2159 do {
2160 new_level = bidi_level_of_next_char (bidi_it);
2161 bidi_cache_iterator_state (bidi_it, 1);
2162 } while (new_level >= level);
2163 }
2164 }
2165
2166 void
2167 bidi_move_to_visually_next (struct bidi_it *bidi_it)
2168 {
2169 int old_level, new_level, next_level;
2170 struct bidi_it sentinel;
2171 struct gcpro gcpro1;
2172
2173 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2174 abort ();
2175
2176 if (bidi_it->scan_dir == 0)
2177 {
2178 bidi_it->scan_dir = 1; /* default to logical order */
2179 }
2180
2181 /* The code below can call eval, and thus cause GC. If we are
2182 iterating a Lisp string, make sure it won't be GCed. */
2183 if (STRINGP (bidi_it->string.lstring))
2184 GCPRO1 (bidi_it->string.lstring);
2185
2186 /* If we just passed a newline, initialize for the next line. */
2187 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
2188 bidi_line_init (bidi_it);
2189
2190 /* Prepare the sentinel iterator state, and cache it. When we bump
2191 into it, scanning backwards, we'll know that the last non-base
2192 level is exhausted. */
2193 if (bidi_cache_idx == bidi_cache_start)
2194 {
2195 bidi_copy_it (&sentinel, bidi_it);
2196 if (bidi_it->first_elt)
2197 {
2198 sentinel.charpos--; /* cached charpos needs to be monotonic */
2199 sentinel.bytepos--;
2200 sentinel.ch = '\n'; /* doesn't matter, but why not? */
2201 sentinel.ch_len = 1;
2202 sentinel.nchars = 1;
2203 }
2204 bidi_cache_iterator_state (&sentinel, 1);
2205 }
2206
2207 old_level = bidi_it->resolved_level;
2208 new_level = bidi_level_of_next_char (bidi_it);
2209
2210 /* Reordering of resolved levels (clause L2) is implemented by
2211 jumping to the other edge of the level and flipping direction of
2212 scanning the text whenever we find a level change. */
2213 if (new_level != old_level)
2214 {
2215 int ascending = new_level > old_level;
2216 int level_to_search = ascending ? old_level + 1 : old_level;
2217 int incr = ascending ? 1 : -1;
2218 int expected_next_level = old_level + incr;
2219
2220 /* Jump (or walk) to the other edge of this level. */
2221 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2222 /* Switch scan direction and peek at the next character in the
2223 new direction. */
2224 bidi_it->scan_dir = -bidi_it->scan_dir;
2225
2226 /* The following loop handles the case where the resolved level
2227 jumps by more than one. This is typical for numbers inside a
2228 run of text with left-to-right embedding direction, but can
2229 also happen in other situations. In those cases the decision
2230 where to continue after a level change, and in what direction,
2231 is tricky. For example, given a text like below:
2232
2233 abcdefgh
2234 11336622
2235
2236 (where the numbers below the text show the resolved levels),
2237 the result of reordering according to UAX#9 should be this:
2238
2239 efdcghba
2240
2241 This is implemented by the loop below which flips direction
2242 and jumps to the other edge of the level each time it finds
2243 the new level not to be the expected one. The expected level
2244 is always one more or one less than the previous one. */
2245 next_level = bidi_peek_at_next_level (bidi_it);
2246 while (next_level != expected_next_level)
2247 {
2248 expected_next_level += incr;
2249 level_to_search += incr;
2250 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2251 bidi_it->scan_dir = -bidi_it->scan_dir;
2252 next_level = bidi_peek_at_next_level (bidi_it);
2253 }
2254
2255 /* Finally, deliver the next character in the new direction. */
2256 next_level = bidi_level_of_next_char (bidi_it);
2257 }
2258
2259 /* Take note when we have just processed the newline that precedes
2260 the end of the paragraph. The next time we are about to be
2261 called, set_iterator_to_next will automatically reinit the
2262 paragraph direction, if needed. We do this at the newline before
2263 the paragraph separator, because the next character might not be
2264 the first character of the next paragraph, due to the bidi
2265 reordering, whereas we _must_ know the paragraph base direction
2266 _before_ we process the paragraph's text, since the base
2267 direction affects the reordering. */
2268 if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
2269 {
2270 /* The paragraph direction of the entire string, once
2271 determined, is in effect for the entire string. Setting the
2272 separator limit to the end of the string prevents
2273 bidi_paragraph_init from being called automatically on this
2274 string. */
2275 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2276 bidi_it->separator_limit = bidi_it->string.schars;
2277 else if (bidi_it->bytepos < ZV_BYTE)
2278 {
2279 EMACS_INT sep_len =
2280 bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2281 bidi_it->bytepos + bidi_it->ch_len);
2282 if (bidi_it->nchars <= 0)
2283 abort ();
2284 if (sep_len >= 0)
2285 {
2286 bidi_it->new_paragraph = 1;
2287 /* Record the buffer position of the last character of the
2288 paragraph separator. */
2289 bidi_it->separator_limit =
2290 bidi_it->charpos + bidi_it->nchars + sep_len;
2291 }
2292 }
2293 }
2294
2295 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
2296 {
2297 /* If we are at paragraph's base embedding level and beyond the
2298 last cached position, the cache's job is done and we can
2299 discard it. */
2300 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
2301 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2302 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
2303 bidi_cache_reset ();
2304 /* But as long as we are caching during forward scan, we must
2305 cache each state, or else the cache integrity will be
2306 compromised: it assumes cached states correspond to buffer
2307 positions 1:1. */
2308 else
2309 bidi_cache_iterator_state (bidi_it, 1);
2310 }
2311
2312 if (STRINGP (bidi_it->string.lstring))
2313 UNGCPRO;
2314 }
2315
2316 /* This is meant to be called from within the debugger, whenever you
2317 wish to examine the cache contents. */
2318 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
2319 void
2320 bidi_dump_cached_states (void)
2321 {
2322 ptrdiff_t i;
2323 int ndigits = 1;
2324
2325 if (bidi_cache_idx == 0)
2326 {
2327 fprintf (stderr, "The cache is empty.\n");
2328 return;
2329 }
2330 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
2331 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2332
2333 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2334 ndigits++;
2335 fputs ("ch ", stderr);
2336 for (i = 0; i < bidi_cache_idx; i++)
2337 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2338 fputs ("\n", stderr);
2339 fputs ("lvl ", stderr);
2340 for (i = 0; i < bidi_cache_idx; i++)
2341 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2342 fputs ("\n", stderr);
2343 fputs ("pos ", stderr);
2344 for (i = 0; i < bidi_cache_idx; i++)
2345 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
2346 fputs ("\n", stderr);
2347 }