]> code.delx.au - gnu-emacs/blob - src/bidi.c
Merge from trunk.
[gnu-emacs] / src / bidi.c
1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
3 Free Software Foundation, Inc.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
19
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 (UBA) as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
26 designed to be called once for every character in the buffer or
27 string.
28
29 The main entry point is bidi_move_to_visually_next. Each time it
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
36 character by resolving their levels on the fly.
37
38 Two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
43 A few auxiliary entry points are used to initialize the bidi
44 iterator for iterating an object (buffer or string), push and pop
45 the bidi iterator state, and save and restore the state of the bidi
46 cache.
47
48 If you want to understand the code, you will have to read it
49 together with the relevant portions of UAX#9. The comments include
50 references to UAX#9 rules, for that very reason.
51
52 A note about references to UAX#9 rules: if the reference says
53 something like "X9/Retaining", it means that you need to refer to
54 rule X9 and to its modifications decribed in the "Implementation
55 Notes" section of UAX#9, under "Retaining Format Codes". */
56
57 #include <config.h>
58 #include <stdio.h>
59 #include <setjmp.h>
60
61 #include "lisp.h"
62 #include "buffer.h"
63 #include "character.h"
64 #include "dispextern.h"
65
66 static int bidi_initialized = 0;
67
68 static Lisp_Object bidi_type_table, bidi_mirror_table;
69
70 #define LRM_CHAR 0x200E
71 #define RLM_CHAR 0x200F
72 #define BIDI_EOB -1
73
74 /* Data type for describing the bidirectional character categories. */
75 typedef enum {
76 UNKNOWN_BC,
77 NEUTRAL,
78 WEAK,
79 STRONG
80 } bidi_category_t;
81
82 /* UAX#9 says to search only for L, AL, or R types of characters, and
83 ignore RLE, RLO, LRE, and LRO, when determining the base paragraph
84 level. Yudit indeed ignores them. This variable is therefore set
85 by default to ignore them, but setting it to zero will take them
86 into account. */
87 extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
88 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
89
90 static Lisp_Object paragraph_start_re, paragraph_separate_re;
91 static Lisp_Object Qparagraph_start, Qparagraph_separate;
92
93 \f
94 /***********************************************************************
95 Utilities
96 ***********************************************************************/
97
98 /* Return the bidi type of a character CH, subject to the current
99 directional OVERRIDE. */
100 static inline bidi_type_t
101 bidi_get_type (int ch, bidi_dir_t override)
102 {
103 bidi_type_t default_type;
104
105 if (ch == BIDI_EOB)
106 return NEUTRAL_B;
107 if (ch < 0 || ch > MAX_CHAR)
108 abort ();
109
110 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
111
112 if (override == NEUTRAL_DIR)
113 return default_type;
114
115 switch (default_type)
116 {
117 /* Although UAX#9 does not tell, it doesn't make sense to
118 override NEUTRAL_B and LRM/RLM characters. */
119 case NEUTRAL_B:
120 case LRE:
121 case LRO:
122 case RLE:
123 case RLO:
124 case PDF:
125 return default_type;
126 default:
127 switch (ch)
128 {
129 case LRM_CHAR:
130 case RLM_CHAR:
131 return default_type;
132 default:
133 if (override == L2R) /* X6 */
134 return STRONG_L;
135 else if (override == R2L)
136 return STRONG_R;
137 else
138 abort (); /* can't happen: handled above */
139 }
140 }
141 }
142
143 static void
144 bidi_check_type (bidi_type_t type)
145 {
146 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
147 abort ();
148 }
149
150 /* Given a bidi TYPE of a character, return its category. */
151 static inline bidi_category_t
152 bidi_get_category (bidi_type_t type)
153 {
154 switch (type)
155 {
156 case UNKNOWN_BT:
157 return UNKNOWN_BC;
158 case STRONG_L:
159 case STRONG_R:
160 case STRONG_AL:
161 case LRE:
162 case LRO:
163 case RLE:
164 case RLO:
165 return STRONG;
166 case PDF: /* ??? really?? */
167 case WEAK_EN:
168 case WEAK_ES:
169 case WEAK_ET:
170 case WEAK_AN:
171 case WEAK_CS:
172 case WEAK_NSM:
173 case WEAK_BN:
174 return WEAK;
175 case NEUTRAL_B:
176 case NEUTRAL_S:
177 case NEUTRAL_WS:
178 case NEUTRAL_ON:
179 return NEUTRAL;
180 default:
181 abort ();
182 }
183 }
184
185 /* Return the mirrored character of C, if it has one. If C has no
186 mirrored counterpart, return C.
187 Note: The conditions in UAX#9 clause L4 regarding the surrounding
188 context must be tested by the caller. */
189 int
190 bidi_mirror_char (int c)
191 {
192 Lisp_Object val;
193
194 if (c == BIDI_EOB)
195 return c;
196 if (c < 0 || c > MAX_CHAR)
197 abort ();
198
199 val = CHAR_TABLE_REF (bidi_mirror_table, c);
200 if (INTEGERP (val))
201 {
202 int v = XINT (val);
203
204 if (v < 0 || v > MAX_CHAR)
205 abort ();
206
207 return v;
208 }
209
210 return c;
211 }
212
213 /* Determine the start-of-run (sor) directional type given the two
214 embedding levels on either side of the run boundary. Also, update
215 the saved info about previously seen characters, since that info is
216 generally valid for a single level run. */
217 static inline void
218 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
219 {
220 int higher_level = level_before > level_after ? level_before : level_after;
221
222 /* The prev_was_pdf gork is required for when we have several PDFs
223 in a row. In that case, we want to compute the sor type for the
224 next level run only once: when we see the first PDF. That's
225 because the sor type depends only on the higher of the two levels
226 that we find on the two sides of the level boundary (see UAX#9,
227 clause X10), and so we don't need to know the final embedding
228 level to which we descend after processing all the PDFs. */
229 if (!bidi_it->prev_was_pdf || level_before < level_after)
230 /* FIXME: should the default sor direction be user selectable? */
231 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
232 if (level_before > level_after)
233 bidi_it->prev_was_pdf = 1;
234
235 bidi_it->prev.type = UNKNOWN_BT;
236 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
237 bidi_it->last_strong.orig_type = UNKNOWN_BT;
238 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
239 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
240 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
241 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
242 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
243 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
244 }
245
246 /* Push the current embedding level and override status; reset the
247 current level to LEVEL and the current override status to OVERRIDE. */
248 static inline void
249 bidi_push_embedding_level (struct bidi_it *bidi_it,
250 int level, bidi_dir_t override)
251 {
252 bidi_it->stack_idx++;
253 xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
254 bidi_it->level_stack[bidi_it->stack_idx].level = level;
255 bidi_it->level_stack[bidi_it->stack_idx].override = override;
256 }
257
258 /* Pop the embedding level and directional override status from the
259 stack, and return the new level. */
260 static inline int
261 bidi_pop_embedding_level (struct bidi_it *bidi_it)
262 {
263 /* UAX#9 says to ignore invalid PDFs. */
264 if (bidi_it->stack_idx > 0)
265 bidi_it->stack_idx--;
266 return bidi_it->level_stack[bidi_it->stack_idx].level;
267 }
268
269 /* Record in SAVED_INFO the information about the current character. */
270 static inline void
271 bidi_remember_char (struct bidi_saved_info *saved_info,
272 struct bidi_it *bidi_it)
273 {
274 saved_info->charpos = bidi_it->charpos;
275 saved_info->bytepos = bidi_it->bytepos;
276 saved_info->type = bidi_it->type;
277 bidi_check_type (bidi_it->type);
278 saved_info->type_after_w1 = bidi_it->type_after_w1;
279 bidi_check_type (bidi_it->type_after_w1);
280 saved_info->orig_type = bidi_it->orig_type;
281 bidi_check_type (bidi_it->orig_type);
282 }
283
284 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
285 copies the part of the level stack that is actually in use. */
286 static inline void
287 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
288 {
289 int i;
290
291 /* Copy everything except the level stack and beyond. */
292 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
293
294 /* Copy the active part of the level stack. */
295 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
296 for (i = 1; i <= from->stack_idx; i++)
297 to->level_stack[i] = from->level_stack[i];
298 }
299
300 \f
301 /***********************************************************************
302 Caching the bidi iterator states
303 ***********************************************************************/
304
305 #define BIDI_CACHE_CHUNK 200
306 static struct bidi_it *bidi_cache;
307 static ptrdiff_t bidi_cache_size = 0;
308 enum { elsz = sizeof (struct bidi_it) };
309 static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
310 static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
311 static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
312 "stack" level */
313
314 /* 5-slot stack for saving the start of the previous level of the
315 cache. xdisp.c maintains a 5-slot stack for its iterator state,
316 and we need the same size of our stack. */
317 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
318 static int bidi_cache_sp;
319
320 /* Size of header used by bidi_shelve_cache. */
321 enum
322 {
323 bidi_shelve_header_size =
324 (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
325 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
326 + sizeof (bidi_cache_last_idx))
327 };
328
329 /* Reset the cache state to the empty state. We only reset the part
330 of the cache relevant to iteration of the current object. Previous
331 objects, which are pushed on the display iterator's stack, are left
332 intact. This is called when the cached information is no more
333 useful for the current iteration, e.g. when we were reseated to a
334 new position on the same object. */
335 static inline void
336 bidi_cache_reset (void)
337 {
338 bidi_cache_idx = bidi_cache_start;
339 bidi_cache_last_idx = -1;
340 }
341
342 /* Shrink the cache to its minimal size. Called when we init the bidi
343 iterator for reordering a buffer or a string that does not come
344 from display properties, because that means all the previously
345 cached info is of no further use. */
346 static inline void
347 bidi_cache_shrink (void)
348 {
349 if (bidi_cache_size > BIDI_CACHE_CHUNK)
350 {
351 bidi_cache_size = BIDI_CACHE_CHUNK;
352 bidi_cache =
353 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
354 }
355 bidi_cache_reset ();
356 }
357
358 static inline void
359 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
360 {
361 int current_scan_dir = bidi_it->scan_dir;
362
363 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
364 abort ();
365
366 bidi_copy_it (bidi_it, &bidi_cache[idx]);
367 bidi_it->scan_dir = current_scan_dir;
368 bidi_cache_last_idx = idx;
369 }
370
371 /* Find a cached state with a given CHARPOS and resolved embedding
372 level less or equal to LEVEL. if LEVEL is -1, disregard the
373 resolved levels in cached states. DIR, if non-zero, means search
374 in that direction from the last cache hit. */
375 static inline ptrdiff_t
376 bidi_cache_search (EMACS_INT charpos, int level, int dir)
377 {
378 ptrdiff_t i, i_start;
379
380 if (bidi_cache_idx > bidi_cache_start)
381 {
382 if (bidi_cache_last_idx == -1)
383 bidi_cache_last_idx = bidi_cache_idx - 1;
384 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
385 {
386 dir = -1;
387 i_start = bidi_cache_last_idx - 1;
388 }
389 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
390 + bidi_cache[bidi_cache_last_idx].nchars - 1))
391 {
392 dir = 1;
393 i_start = bidi_cache_last_idx + 1;
394 }
395 else if (dir)
396 i_start = bidi_cache_last_idx;
397 else
398 {
399 dir = -1;
400 i_start = bidi_cache_idx - 1;
401 }
402
403 if (dir < 0)
404 {
405 /* Linear search for now; FIXME! */
406 for (i = i_start; i >= bidi_cache_start; i--)
407 if (bidi_cache[i].charpos <= charpos
408 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
409 && (level == -1 || bidi_cache[i].resolved_level <= level))
410 return i;
411 }
412 else
413 {
414 for (i = i_start; i < bidi_cache_idx; i++)
415 if (bidi_cache[i].charpos <= charpos
416 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
417 && (level == -1 || bidi_cache[i].resolved_level <= level))
418 return i;
419 }
420 }
421
422 return -1;
423 }
424
425 /* Find a cached state where the resolved level changes to a value
426 that is lower than LEVEL, and return its cache slot index. DIR is
427 the direction to search, starting with the last used cache slot.
428 If DIR is zero, we search backwards from the last occupied cache
429 slot. BEFORE, if non-zero, means return the index of the slot that
430 is ``before'' the level change in the search direction. That is,
431 given the cached levels like this:
432
433 1122333442211
434 AB C
435
436 and assuming we are at the position cached at the slot marked with
437 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
438 index of slot B or A, depending whether BEFORE is, respectively,
439 non-zero or zero. */
440 static ptrdiff_t
441 bidi_cache_find_level_change (int level, int dir, int before)
442 {
443 if (bidi_cache_idx)
444 {
445 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
446 int incr = before ? 1 : 0;
447
448 xassert (!dir || bidi_cache_last_idx >= 0);
449
450 if (!dir)
451 dir = -1;
452 else if (!incr)
453 i += dir;
454
455 if (dir < 0)
456 {
457 while (i >= bidi_cache_start + incr)
458 {
459 if (bidi_cache[i - incr].resolved_level >= 0
460 && bidi_cache[i - incr].resolved_level < level)
461 return i;
462 i--;
463 }
464 }
465 else
466 {
467 while (i < bidi_cache_idx - incr)
468 {
469 if (bidi_cache[i + incr].resolved_level >= 0
470 && bidi_cache[i + incr].resolved_level < level)
471 return i;
472 i++;
473 }
474 }
475 }
476
477 return -1;
478 }
479
480 static inline void
481 bidi_cache_ensure_space (ptrdiff_t idx)
482 {
483 /* Enlarge the cache as needed. */
484 if (idx >= bidi_cache_size)
485 {
486 /* The bidi cache cannot be larger than the largest Lisp string
487 or buffer. */
488 ptrdiff_t string_or_buffer_bound =
489 max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
490
491 /* Also, it cannot be larger than what C can represent. */
492 ptrdiff_t c_bound =
493 (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
494
495 bidi_cache =
496 xpalloc (bidi_cache, &bidi_cache_size,
497 max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
498 min (string_or_buffer_bound, c_bound), elsz);
499 }
500 }
501
502 static inline void
503 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
504 {
505 ptrdiff_t idx;
506
507 /* We should never cache on backward scans. */
508 if (bidi_it->scan_dir == -1)
509 abort ();
510 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
511
512 if (idx < 0)
513 {
514 idx = bidi_cache_idx;
515 bidi_cache_ensure_space (idx);
516 /* Character positions should correspond to cache positions 1:1.
517 If we are outside the range of cached positions, the cache is
518 useless and must be reset. */
519 if (idx > bidi_cache_start &&
520 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
521 + bidi_cache[idx - 1].nchars)
522 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
523 {
524 bidi_cache_reset ();
525 idx = bidi_cache_start;
526 }
527 if (bidi_it->nchars <= 0)
528 abort ();
529 bidi_copy_it (&bidi_cache[idx], bidi_it);
530 if (!resolved)
531 bidi_cache[idx].resolved_level = -1;
532 }
533 else
534 {
535 /* Copy only the members which could have changed, to avoid
536 costly copying of the entire struct. */
537 bidi_cache[idx].type = bidi_it->type;
538 bidi_check_type (bidi_it->type);
539 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
540 bidi_check_type (bidi_it->type_after_w1);
541 if (resolved)
542 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
543 else
544 bidi_cache[idx].resolved_level = -1;
545 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
546 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
547 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
548 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
549 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
550 }
551
552 bidi_cache_last_idx = idx;
553 if (idx >= bidi_cache_idx)
554 bidi_cache_idx = idx + 1;
555 }
556
557 static inline bidi_type_t
558 bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
559 {
560 ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
561
562 if (i >= bidi_cache_start)
563 {
564 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
565
566 bidi_copy_it (bidi_it, &bidi_cache[i]);
567 bidi_cache_last_idx = i;
568 /* Don't let scan direction from from the cached state override
569 the current scan direction. */
570 bidi_it->scan_dir = current_scan_dir;
571 return bidi_it->type;
572 }
573
574 return UNKNOWN_BT;
575 }
576
577 static inline int
578 bidi_peek_at_next_level (struct bidi_it *bidi_it)
579 {
580 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
581 abort ();
582 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
583 }
584
585 \f
586 /***********************************************************************
587 Pushing and popping the bidi iterator state
588 ***********************************************************************/
589
590 /* Push the bidi iterator state in preparation for reordering a
591 different object, e.g. display string found at certain buffer
592 position. Pushing the bidi iterator boils down to saving its
593 entire state on the cache and starting a new cache "stacked" on top
594 of the current cache. */
595 void
596 bidi_push_it (struct bidi_it *bidi_it)
597 {
598 /* Save the current iterator state in its entirety after the last
599 used cache slot. */
600 bidi_cache_ensure_space (bidi_cache_idx);
601 memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
602
603 /* Push the current cache start onto the stack. */
604 xassert (bidi_cache_sp < IT_STACK_SIZE);
605 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
606
607 /* Start a new level of cache, and make it empty. */
608 bidi_cache_start = bidi_cache_idx;
609 bidi_cache_last_idx = -1;
610 }
611
612 /* Restore the iterator state saved by bidi_push_it and return the
613 cache to the corresponding state. */
614 void
615 bidi_pop_it (struct bidi_it *bidi_it)
616 {
617 if (bidi_cache_start <= 0)
618 abort ();
619
620 /* Reset the next free cache slot index to what it was before the
621 call to bidi_push_it. */
622 bidi_cache_idx = bidi_cache_start - 1;
623
624 /* Restore the bidi iterator state saved in the cache. */
625 memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it));
626
627 /* Pop the previous cache start from the stack. */
628 if (bidi_cache_sp <= 0)
629 abort ();
630 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
631
632 /* Invalidate the last-used cache slot data. */
633 bidi_cache_last_idx = -1;
634 }
635
636 static ptrdiff_t bidi_cache_total_alloc;
637
638 /* Stash away a copy of the cache and its control variables. */
639 void *
640 bidi_shelve_cache (void)
641 {
642 unsigned char *databuf;
643 ptrdiff_t alloc;
644
645 /* Empty cache. */
646 if (bidi_cache_idx == 0)
647 return NULL;
648
649 alloc = (bidi_shelve_header_size
650 + bidi_cache_idx * sizeof (struct bidi_it));
651 databuf = xmalloc (alloc);
652 bidi_cache_total_alloc += alloc;
653
654 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
655 memcpy (databuf + sizeof (bidi_cache_idx),
656 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
657 memcpy (databuf + sizeof (bidi_cache_idx)
658 + bidi_cache_idx * sizeof (struct bidi_it),
659 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
660 memcpy (databuf + sizeof (bidi_cache_idx)
661 + bidi_cache_idx * sizeof (struct bidi_it)
662 + sizeof (bidi_cache_start_stack),
663 &bidi_cache_sp, sizeof (bidi_cache_sp));
664 memcpy (databuf + sizeof (bidi_cache_idx)
665 + bidi_cache_idx * sizeof (struct bidi_it)
666 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
667 &bidi_cache_start, sizeof (bidi_cache_start));
668 memcpy (databuf + sizeof (bidi_cache_idx)
669 + bidi_cache_idx * sizeof (struct bidi_it)
670 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
671 + sizeof (bidi_cache_start),
672 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
673
674 return databuf;
675 }
676
677 /* Restore the cache state from a copy stashed away by
678 bidi_shelve_cache, and free the buffer used to stash that copy.
679 JUST_FREE non-zero means free the buffer, but don't restore the
680 cache; used when the corresponding iterator is discarded instead of
681 being restored. */
682 void
683 bidi_unshelve_cache (void *databuf, int just_free)
684 {
685 unsigned char *p = databuf;
686
687 if (!p)
688 {
689 if (!just_free)
690 {
691 /* A NULL pointer means an empty cache. */
692 bidi_cache_start = 0;
693 bidi_cache_sp = 0;
694 bidi_cache_reset ();
695 }
696 }
697 else
698 {
699 if (just_free)
700 {
701 ptrdiff_t idx;
702
703 memcpy (&idx, p, sizeof (bidi_cache_idx));
704 bidi_cache_total_alloc -=
705 bidi_shelve_header_size + idx * sizeof (struct bidi_it);
706 }
707 else
708 {
709 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
710 bidi_cache_ensure_space (bidi_cache_idx);
711 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
712 bidi_cache_idx * sizeof (struct bidi_it));
713 memcpy (bidi_cache_start_stack,
714 p + sizeof (bidi_cache_idx)
715 + bidi_cache_idx * sizeof (struct bidi_it),
716 sizeof (bidi_cache_start_stack));
717 memcpy (&bidi_cache_sp,
718 p + sizeof (bidi_cache_idx)
719 + bidi_cache_idx * sizeof (struct bidi_it)
720 + sizeof (bidi_cache_start_stack),
721 sizeof (bidi_cache_sp));
722 memcpy (&bidi_cache_start,
723 p + sizeof (bidi_cache_idx)
724 + bidi_cache_idx * sizeof (struct bidi_it)
725 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
726 sizeof (bidi_cache_start));
727 memcpy (&bidi_cache_last_idx,
728 p + sizeof (bidi_cache_idx)
729 + bidi_cache_idx * sizeof (struct bidi_it)
730 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
731 + sizeof (bidi_cache_start),
732 sizeof (bidi_cache_last_idx));
733 bidi_cache_total_alloc -=
734 bidi_shelve_header_size + bidi_cache_idx * sizeof (struct bidi_it);
735 }
736
737 xfree (p);
738 }
739 }
740
741 \f
742 /***********************************************************************
743 Initialization
744 ***********************************************************************/
745 static void
746 bidi_initialize (void)
747 {
748 bidi_type_table = uniprop_table (intern ("bidi-class"));
749 if (NILP (bidi_type_table))
750 abort ();
751 staticpro (&bidi_type_table);
752
753 bidi_mirror_table = uniprop_table (intern ("mirroring"));
754 if (NILP (bidi_mirror_table))
755 abort ();
756 staticpro (&bidi_mirror_table);
757
758 Qparagraph_start = intern ("paragraph-start");
759 staticpro (&Qparagraph_start);
760 paragraph_start_re = Fsymbol_value (Qparagraph_start);
761 if (!STRINGP (paragraph_start_re))
762 paragraph_start_re = build_string ("\f\\|[ \t]*$");
763 staticpro (&paragraph_start_re);
764 Qparagraph_separate = intern ("paragraph-separate");
765 staticpro (&Qparagraph_separate);
766 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
767 if (!STRINGP (paragraph_separate_re))
768 paragraph_separate_re = build_string ("[ \t\f]*$");
769 staticpro (&paragraph_separate_re);
770
771 bidi_cache_sp = 0;
772 bidi_cache_total_alloc = 0;
773
774 bidi_initialized = 1;
775 }
776
777 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
778 end. */
779 static inline void
780 bidi_set_paragraph_end (struct bidi_it *bidi_it)
781 {
782 bidi_it->invalid_levels = 0;
783 bidi_it->invalid_rl_levels = -1;
784 bidi_it->stack_idx = 0;
785 bidi_it->resolved_level = bidi_it->level_stack[0].level;
786 }
787
788 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
789 void
790 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
791 struct bidi_it *bidi_it)
792 {
793 if (! bidi_initialized)
794 bidi_initialize ();
795 if (charpos >= 0)
796 bidi_it->charpos = charpos;
797 if (bytepos >= 0)
798 bidi_it->bytepos = bytepos;
799 bidi_it->frame_window_p = frame_window_p;
800 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
801 bidi_it->first_elt = 1;
802 bidi_set_paragraph_end (bidi_it);
803 bidi_it->new_paragraph = 1;
804 bidi_it->separator_limit = -1;
805 bidi_it->type = NEUTRAL_B;
806 bidi_it->type_after_w1 = NEUTRAL_B;
807 bidi_it->orig_type = NEUTRAL_B;
808 bidi_it->prev_was_pdf = 0;
809 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
810 bidi_it->prev.orig_type = UNKNOWN_BT;
811 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
812 bidi_it->last_strong.orig_type = UNKNOWN_BT;
813 bidi_it->next_for_neutral.charpos = -1;
814 bidi_it->next_for_neutral.type =
815 bidi_it->next_for_neutral.type_after_w1 =
816 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
817 bidi_it->prev_for_neutral.charpos = -1;
818 bidi_it->prev_for_neutral.type =
819 bidi_it->prev_for_neutral.type_after_w1 =
820 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
821 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
822 bidi_it->disp_pos = -1; /* invalid/unknown */
823 bidi_it->disp_prop_p = 0;
824 /* We can only shrink the cache if we are at the bottom level of its
825 "stack". */
826 if (bidi_cache_start == 0)
827 bidi_cache_shrink ();
828 else
829 bidi_cache_reset ();
830 }
831
832 /* Perform initializations for reordering a new line of bidi text. */
833 static void
834 bidi_line_init (struct bidi_it *bidi_it)
835 {
836 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
837 bidi_it->resolved_level = bidi_it->level_stack[0].level;
838 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
839 bidi_it->invalid_levels = 0;
840 bidi_it->invalid_rl_levels = -1;
841 bidi_it->next_en_pos = -1;
842 bidi_it->next_for_ws.type = UNKNOWN_BT;
843 bidi_set_sor_type (bidi_it,
844 bidi_it->paragraph_dir == R2L ? 1 : 0,
845 bidi_it->level_stack[0].level); /* X10 */
846
847 bidi_cache_reset ();
848 }
849
850 \f
851 /***********************************************************************
852 Fetching characters
853 ***********************************************************************/
854
855 /* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
856 are zero-based character positions in S, BEGBYTE is byte position
857 corresponding to BEG. UNIBYTE, if non-zero, means S is a unibyte
858 string. */
859 static inline EMACS_INT
860 bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
861 const EMACS_INT begbyte, const EMACS_INT end, int unibyte)
862 {
863 EMACS_INT pos = beg;
864 const unsigned char *p = s + begbyte, *start = p;
865
866 if (unibyte)
867 p = s + end;
868 else
869 {
870 if (!CHAR_HEAD_P (*p))
871 abort ();
872
873 while (pos < end)
874 {
875 p += BYTES_BY_CHAR_HEAD (*p);
876 pos++;
877 }
878 }
879
880 return p - start;
881 }
882
883 /* Fetch and returns the character at byte position BYTEPOS. If S is
884 non-NULL, fetch the character from string S; otherwise fetch the
885 character from the current buffer. UNIBYTE non-zero means S is a
886 unibyte string. */
887 static inline int
888 bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s, int unibyte)
889 {
890 if (s)
891 {
892 if (unibyte)
893 return s[bytepos];
894 else
895 return STRING_CHAR (s + bytepos);
896 }
897 else
898 return FETCH_MULTIBYTE_CHAR (bytepos);
899 }
900
901 /* Fetch and return the character at BYTEPOS/CHARPOS. If that
902 character is covered by a display string, treat the entire run of
903 covered characters as a single character u+FFFC, and return their
904 combined length in CH_LEN and NCHARS. DISP_POS specifies the
905 character position of the next display string, or -1 if not yet
906 computed. DISP_PROP_P non-zero means that there's really a display
907 string at DISP_POS, as opposed to when we searched till DISP_POS
908 without findingone. When the next character is at or beyond that
909 position, the function updates DISP_POS with the position of the
910 next display string. STRING->s is the C string to iterate, or NULL
911 if iterating over a buffer or a Lisp string; in the latter case,
912 STRING->lstring is the Lisp string. */
913 static inline int
914 bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
915 int *disp_prop_p, struct bidi_string_data *string,
916 int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
917 {
918 int ch;
919 EMACS_INT endpos =
920 (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
921 struct text_pos pos;
922
923 /* If we got past the last known position of display string, compute
924 the position of the next one. That position could be at CHARPOS. */
925 if (charpos < endpos && charpos > *disp_pos)
926 {
927 SET_TEXT_POS (pos, charpos, bytepos);
928 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
929 disp_prop_p);
930 }
931
932 /* Fetch the character at BYTEPOS. */
933 if (charpos >= endpos)
934 {
935 ch = BIDI_EOB;
936 *ch_len = 1;
937 *nchars = 1;
938 *disp_pos = endpos;
939 *disp_prop_p = 0;
940 }
941 else if (charpos >= *disp_pos && *disp_prop_p)
942 {
943 EMACS_INT disp_end_pos;
944
945 /* We don't expect to find ourselves in the middle of a display
946 property. Hopefully, it will never be needed. */
947 if (charpos > *disp_pos)
948 abort ();
949 /* Return the Unicode Object Replacement Character to represent
950 the entire run of characters covered by the display string. */
951 ch = 0xFFFC;
952 disp_end_pos = compute_display_string_end (*disp_pos, string);
953 *nchars = disp_end_pos - *disp_pos;
954 if (*nchars <= 0)
955 abort ();
956 if (string->s)
957 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
958 disp_end_pos, string->unibyte);
959 else if (STRINGP (string->lstring))
960 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
961 bytepos, disp_end_pos, string->unibyte);
962 else
963 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
964 }
965 else
966 {
967 if (string->s)
968 {
969 int len;
970
971 if (!string->unibyte)
972 {
973 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
974 *ch_len = len;
975 }
976 else
977 {
978 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
979 *ch_len = 1;
980 }
981 }
982 else if (STRINGP (string->lstring))
983 {
984 int len;
985
986 if (!string->unibyte)
987 {
988 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
989 len);
990 *ch_len = len;
991 }
992 else
993 {
994 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
995 *ch_len = 1;
996 }
997 }
998 else
999 {
1000 ch = FETCH_MULTIBYTE_CHAR (bytepos);
1001 *ch_len = CHAR_BYTES (ch);
1002 }
1003 *nchars = 1;
1004 }
1005
1006 /* If we just entered a run of characters covered by a display
1007 string, compute the position of the next display string. */
1008 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1009 && *disp_prop_p)
1010 {
1011 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1012 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
1013 disp_prop_p);
1014 }
1015
1016 return ch;
1017 }
1018
1019 \f
1020 /***********************************************************************
1021 Determining paragraph direction
1022 ***********************************************************************/
1023
1024 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1025 Value is the non-negative length of the paragraph separator
1026 following the buffer position, -1 if position is at the beginning
1027 of a new paragraph, or -2 if position is neither at beginning nor
1028 at end of a paragraph. */
1029 static EMACS_INT
1030 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
1031 {
1032 Lisp_Object sep_re;
1033 Lisp_Object start_re;
1034 EMACS_INT val;
1035
1036 sep_re = paragraph_separate_re;
1037 start_re = paragraph_start_re;
1038
1039 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1040 if (val < 0)
1041 {
1042 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1043 val = -1;
1044 else
1045 val = -2;
1046 }
1047
1048 return val;
1049 }
1050
1051 /* Find the beginning of this paragraph by looking back in the buffer.
1052 Value is the byte position of the paragraph's beginning. */
1053 static EMACS_INT
1054 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
1055 {
1056 Lisp_Object re = paragraph_start_re;
1057 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
1058
1059 while (pos_byte > BEGV_BYTE
1060 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1061 {
1062 /* FIXME: What if the paragraph beginning is covered by a
1063 display string? And what if a display string covering some
1064 of the text over which we scan back includes
1065 paragraph_start_re? */
1066 pos = find_next_newline_no_quit (pos - 1, -1);
1067 pos_byte = CHAR_TO_BYTE (pos);
1068 }
1069 return pos_byte;
1070 }
1071
1072 /* Determine the base direction, a.k.a. base embedding level, of the
1073 paragraph we are about to iterate through. If DIR is either L2R or
1074 R2L, just use that. Otherwise, determine the paragraph direction
1075 from the first strong directional character of the paragraph.
1076
1077 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
1078 has no strong directional characters and both DIR and
1079 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1080 in the buffer until a paragraph is found with a strong character,
1081 or until hitting BEGV. In the latter case, fall back to L2R. This
1082 flag is used in current-bidi-paragraph-direction.
1083
1084 Note that this function gives the paragraph separator the same
1085 direction as the preceding paragraph, even though Emacs generally
1086 views the separartor as not belonging to any paragraph. */
1087 void
1088 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
1089 {
1090 EMACS_INT bytepos = bidi_it->bytepos;
1091 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
1092 EMACS_INT pstartbyte;
1093 /* Note that begbyte is a byte position, while end is a character
1094 position. Yes, this is ugly, but we are trying to avoid costly
1095 calls to BYTE_TO_CHAR and its ilk. */
1096 EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
1097 EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
1098
1099 /* Special case for an empty buffer. */
1100 if (bytepos == begbyte && bidi_it->charpos == end)
1101 dir = L2R;
1102 /* We should never be called at EOB or before BEGV. */
1103 else if (bidi_it->charpos >= end || bytepos < begbyte)
1104 abort ();
1105
1106 if (dir == L2R)
1107 {
1108 bidi_it->paragraph_dir = L2R;
1109 bidi_it->new_paragraph = 0;
1110 }
1111 else if (dir == R2L)
1112 {
1113 bidi_it->paragraph_dir = R2L;
1114 bidi_it->new_paragraph = 0;
1115 }
1116 else if (dir == NEUTRAL_DIR) /* P2 */
1117 {
1118 int ch;
1119 EMACS_INT ch_len, nchars;
1120 EMACS_INT pos, disp_pos = -1;
1121 int disp_prop_p = 0;
1122 bidi_type_t type;
1123 const unsigned char *s;
1124
1125 if (!bidi_initialized)
1126 bidi_initialize ();
1127
1128 /* If we are inside a paragraph separator, we are just waiting
1129 for the separator to be exhausted; use the previous paragraph
1130 direction. But don't do that if we have been just reseated,
1131 because we need to reinitialize below in that case. */
1132 if (!bidi_it->first_elt
1133 && bidi_it->charpos < bidi_it->separator_limit)
1134 return;
1135
1136 /* If we are on a newline, get past it to where the next
1137 paragraph might start. But don't do that at BEGV since then
1138 we are potentially in a new paragraph that doesn't yet
1139 exist. */
1140 pos = bidi_it->charpos;
1141 s = STRINGP (bidi_it->string.lstring) ?
1142 SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1143 if (bytepos > begbyte
1144 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1145 {
1146 bytepos++;
1147 pos++;
1148 }
1149
1150 /* We are either at the beginning of a paragraph or in the
1151 middle of it. Find where this paragraph starts. */
1152 if (string_p)
1153 {
1154 /* We don't support changes of paragraph direction inside a
1155 string. It is treated as a single paragraph. */
1156 pstartbyte = 0;
1157 }
1158 else
1159 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1160 bidi_it->separator_limit = -1;
1161 bidi_it->new_paragraph = 0;
1162
1163 /* The following loop is run more than once only if NO_DEFAULT_P
1164 is non-zero, and only if we are iterating on a buffer. */
1165 do {
1166 bytepos = pstartbyte;
1167 if (!string_p)
1168 pos = BYTE_TO_CHAR (bytepos);
1169 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &disp_prop_p,
1170 &bidi_it->string,
1171 bidi_it->frame_window_p, &ch_len, &nchars);
1172 type = bidi_get_type (ch, NEUTRAL_DIR);
1173
1174 for (pos += nchars, bytepos += ch_len;
1175 (bidi_get_category (type) != STRONG)
1176 || (bidi_ignore_explicit_marks_for_paragraph_level
1177 && (type == RLE || type == RLO
1178 || type == LRE || type == LRO));
1179 type = bidi_get_type (ch, NEUTRAL_DIR))
1180 {
1181 if (pos >= end)
1182 {
1183 /* Pretend there's a paragraph separator at end of
1184 buffer/string. */
1185 type = NEUTRAL_B;
1186 break;
1187 }
1188 if (!string_p
1189 && type == NEUTRAL_B
1190 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1191 break;
1192 /* Fetch next character and advance to get past it. */
1193 ch = bidi_fetch_char (bytepos, pos, &disp_pos,
1194 &disp_prop_p, &bidi_it->string,
1195 bidi_it->frame_window_p, &ch_len, &nchars);
1196 pos += nchars;
1197 bytepos += ch_len;
1198 }
1199 if ((type == STRONG_R || type == STRONG_AL) /* P3 */
1200 || (!bidi_ignore_explicit_marks_for_paragraph_level
1201 && (type == RLO || type == RLE)))
1202 bidi_it->paragraph_dir = R2L;
1203 else if (type == STRONG_L
1204 || (!bidi_ignore_explicit_marks_for_paragraph_level
1205 && (type == LRO || type == LRE)))
1206 bidi_it->paragraph_dir = L2R;
1207 if (!string_p
1208 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1209 {
1210 /* If this paragraph is at BEGV, default to L2R. */
1211 if (pstartbyte == BEGV_BYTE)
1212 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1213 else
1214 {
1215 EMACS_INT prevpbyte = pstartbyte;
1216 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1217
1218 /* Find the beginning of the previous paragraph, if any. */
1219 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1220 {
1221 /* FXIME: What if p is covered by a display
1222 string? See also a FIXME inside
1223 bidi_find_paragraph_start. */
1224 p--;
1225 pbyte = CHAR_TO_BYTE (p);
1226 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1227 }
1228 pstartbyte = prevpbyte;
1229 }
1230 }
1231 } while (!string_p
1232 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1233 }
1234 else
1235 abort ();
1236
1237 /* Contrary to UAX#9 clause P3, we only default the paragraph
1238 direction to L2R if we have no previous usable paragraph
1239 direction. This is allowed by the HL1 clause. */
1240 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1241 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1242 if (bidi_it->paragraph_dir == R2L)
1243 bidi_it->level_stack[0].level = 1;
1244 else
1245 bidi_it->level_stack[0].level = 0;
1246
1247 bidi_line_init (bidi_it);
1248 }
1249
1250 \f
1251 /***********************************************************************
1252 Resolving explicit and implicit levels.
1253 The rest of this file constitutes the core of the UBA implementation.
1254 ***********************************************************************/
1255
1256 static inline int
1257 bidi_explicit_dir_char (int ch)
1258 {
1259 bidi_type_t ch_type;
1260
1261 if (!bidi_initialized)
1262 abort ();
1263 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1264 return (ch_type == LRE || ch_type == LRO
1265 || ch_type == RLE || ch_type == RLO
1266 || ch_type == PDF);
1267 }
1268
1269 /* A helper function for bidi_resolve_explicit. It advances to the
1270 next character in logical order and determines the new embedding
1271 level and directional override, but does not take into account
1272 empty embeddings. */
1273 static int
1274 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1275 {
1276 int curchar;
1277 bidi_type_t type;
1278 int current_level;
1279 int new_level;
1280 bidi_dir_t override;
1281 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
1282
1283 /* If reseat()'ed, don't advance, so as to start iteration from the
1284 position where we were reseated. bidi_it->bytepos can be less
1285 than BEGV_BYTE after reseat to BEGV. */
1286 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1287 || bidi_it->first_elt)
1288 {
1289 bidi_it->first_elt = 0;
1290 if (string_p)
1291 {
1292 const unsigned char *p =
1293 STRINGP (bidi_it->string.lstring)
1294 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1295
1296 if (bidi_it->charpos < 0)
1297 bidi_it->charpos = 0;
1298 bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos,
1299 bidi_it->string.unibyte);
1300 }
1301 else
1302 {
1303 if (bidi_it->charpos < BEGV)
1304 bidi_it->charpos = BEGV;
1305 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1306 }
1307 }
1308 /* Don't move at end of buffer/string. */
1309 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1310 {
1311 /* Advance to the next character, skipping characters covered by
1312 display strings (nchars > 1). */
1313 if (bidi_it->nchars <= 0)
1314 abort ();
1315 bidi_it->charpos += bidi_it->nchars;
1316 if (bidi_it->ch_len == 0)
1317 abort ();
1318 bidi_it->bytepos += bidi_it->ch_len;
1319 }
1320
1321 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1322 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1323 new_level = current_level;
1324
1325 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1326 {
1327 curchar = BIDI_EOB;
1328 bidi_it->ch_len = 1;
1329 bidi_it->nchars = 1;
1330 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1331 bidi_it->disp_prop_p = 0;
1332 }
1333 else
1334 {
1335 /* Fetch the character at BYTEPOS. If it is covered by a
1336 display string, treat the entire run of covered characters as
1337 a single character u+FFFC. */
1338 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
1339 &bidi_it->disp_pos, &bidi_it->disp_prop_p,
1340 &bidi_it->string, bidi_it->frame_window_p,
1341 &bidi_it->ch_len, &bidi_it->nchars);
1342 }
1343 bidi_it->ch = curchar;
1344
1345 /* Don't apply directional override here, as all the types we handle
1346 below will not be affected by the override anyway, and we need
1347 the original type unaltered. The override will be applied in
1348 bidi_resolve_weak. */
1349 type = bidi_get_type (curchar, NEUTRAL_DIR);
1350 bidi_it->orig_type = type;
1351 bidi_check_type (bidi_it->orig_type);
1352
1353 if (type != PDF)
1354 bidi_it->prev_was_pdf = 0;
1355
1356 bidi_it->type_after_w1 = UNKNOWN_BT;
1357
1358 switch (type)
1359 {
1360 case RLE: /* X2 */
1361 case RLO: /* X4 */
1362 bidi_it->type_after_w1 = type;
1363 bidi_check_type (bidi_it->type_after_w1);
1364 type = WEAK_BN; /* X9/Retaining */
1365 if (bidi_it->ignore_bn_limit <= -1)
1366 {
1367 if (current_level <= BIDI_MAXLEVEL - 4)
1368 {
1369 /* Compute the least odd embedding level greater than
1370 the current level. */
1371 new_level = ((current_level + 1) & ~1) + 1;
1372 if (bidi_it->type_after_w1 == RLE)
1373 override = NEUTRAL_DIR;
1374 else
1375 override = R2L;
1376 if (current_level == BIDI_MAXLEVEL - 4)
1377 bidi_it->invalid_rl_levels = 0;
1378 bidi_push_embedding_level (bidi_it, new_level, override);
1379 }
1380 else
1381 {
1382 bidi_it->invalid_levels++;
1383 /* See the commentary about invalid_rl_levels below. */
1384 if (bidi_it->invalid_rl_levels < 0)
1385 bidi_it->invalid_rl_levels = 0;
1386 bidi_it->invalid_rl_levels++;
1387 }
1388 }
1389 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1390 || bidi_it->next_en_pos > bidi_it->charpos)
1391 type = WEAK_EN;
1392 break;
1393 case LRE: /* X3 */
1394 case LRO: /* X5 */
1395 bidi_it->type_after_w1 = type;
1396 bidi_check_type (bidi_it->type_after_w1);
1397 type = WEAK_BN; /* X9/Retaining */
1398 if (bidi_it->ignore_bn_limit <= -1)
1399 {
1400 if (current_level <= BIDI_MAXLEVEL - 5)
1401 {
1402 /* Compute the least even embedding level greater than
1403 the current level. */
1404 new_level = ((current_level + 2) & ~1);
1405 if (bidi_it->type_after_w1 == LRE)
1406 override = NEUTRAL_DIR;
1407 else
1408 override = L2R;
1409 bidi_push_embedding_level (bidi_it, new_level, override);
1410 }
1411 else
1412 {
1413 bidi_it->invalid_levels++;
1414 /* invalid_rl_levels counts invalid levels encountered
1415 while the embedding level was already too high for
1416 LRE/LRO, but not for RLE/RLO. That is because
1417 there may be exactly one PDF which we should not
1418 ignore even though invalid_levels is non-zero.
1419 invalid_rl_levels helps to know what PDF is
1420 that. */
1421 if (bidi_it->invalid_rl_levels >= 0)
1422 bidi_it->invalid_rl_levels++;
1423 }
1424 }
1425 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1426 || bidi_it->next_en_pos > bidi_it->charpos)
1427 type = WEAK_EN;
1428 break;
1429 case PDF: /* X7 */
1430 bidi_it->type_after_w1 = type;
1431 bidi_check_type (bidi_it->type_after_w1);
1432 type = WEAK_BN; /* X9/Retaining */
1433 if (bidi_it->ignore_bn_limit <= -1)
1434 {
1435 if (!bidi_it->invalid_rl_levels)
1436 {
1437 new_level = bidi_pop_embedding_level (bidi_it);
1438 bidi_it->invalid_rl_levels = -1;
1439 if (bidi_it->invalid_levels)
1440 bidi_it->invalid_levels--;
1441 /* else nothing: UAX#9 says to ignore invalid PDFs */
1442 }
1443 if (!bidi_it->invalid_levels)
1444 new_level = bidi_pop_embedding_level (bidi_it);
1445 else
1446 {
1447 bidi_it->invalid_levels--;
1448 bidi_it->invalid_rl_levels--;
1449 }
1450 }
1451 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1452 || bidi_it->next_en_pos > bidi_it->charpos)
1453 type = WEAK_EN;
1454 break;
1455 default:
1456 /* Nothing. */
1457 break;
1458 }
1459
1460 bidi_it->type = type;
1461 bidi_check_type (bidi_it->type);
1462
1463 return new_level;
1464 }
1465
1466 /* Given an iterator state in BIDI_IT, advance one character position
1467 in the buffer/string to the next character (in the logical order),
1468 resolve any explicit embeddings and directional overrides, and
1469 return the embedding level of the character after resolving
1470 explicit directives and ignoring empty embeddings. */
1471 static int
1472 bidi_resolve_explicit (struct bidi_it *bidi_it)
1473 {
1474 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1475 int new_level = bidi_resolve_explicit_1 (bidi_it);
1476 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
1477 const unsigned char *s = STRINGP (bidi_it->string.lstring)
1478 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1479
1480 if (prev_level < new_level
1481 && bidi_it->type == WEAK_BN
1482 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1483 && bidi_it->charpos < eob /* not already at EOB */
1484 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1485 + bidi_it->ch_len, s,
1486 bidi_it->string.unibyte)))
1487 {
1488 /* Avoid pushing and popping embedding levels if the level run
1489 is empty, as this breaks level runs where it shouldn't.
1490 UAX#9 removes all the explicit embedding and override codes,
1491 so empty embeddings disappear without a trace. We need to
1492 behave as if we did the same. */
1493 struct bidi_it saved_it;
1494 int level = prev_level;
1495
1496 bidi_copy_it (&saved_it, bidi_it);
1497
1498 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1499 + bidi_it->ch_len, s,
1500 bidi_it->string.unibyte)))
1501 {
1502 /* This advances to the next character, skipping any
1503 characters covered by display strings. */
1504 level = bidi_resolve_explicit_1 (bidi_it);
1505 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1506 a pointer to its data is no longer valid. */
1507 if (STRINGP (bidi_it->string.lstring))
1508 s = SDATA (bidi_it->string.lstring);
1509 }
1510
1511 if (bidi_it->nchars <= 0)
1512 abort ();
1513 if (level == prev_level) /* empty embedding */
1514 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
1515 else /* this embedding is non-empty */
1516 saved_it.ignore_bn_limit = -2;
1517
1518 bidi_copy_it (bidi_it, &saved_it);
1519 if (bidi_it->ignore_bn_limit > -1)
1520 {
1521 /* We pushed a level, but we shouldn't have. Undo that. */
1522 if (!bidi_it->invalid_rl_levels)
1523 {
1524 new_level = bidi_pop_embedding_level (bidi_it);
1525 bidi_it->invalid_rl_levels = -1;
1526 if (bidi_it->invalid_levels)
1527 bidi_it->invalid_levels--;
1528 }
1529 if (!bidi_it->invalid_levels)
1530 new_level = bidi_pop_embedding_level (bidi_it);
1531 else
1532 {
1533 bidi_it->invalid_levels--;
1534 bidi_it->invalid_rl_levels--;
1535 }
1536 }
1537 }
1538
1539 if (bidi_it->type == NEUTRAL_B) /* X8 */
1540 {
1541 bidi_set_paragraph_end (bidi_it);
1542 /* This is needed by bidi_resolve_weak below, and in L1. */
1543 bidi_it->type_after_w1 = bidi_it->type;
1544 bidi_check_type (bidi_it->type_after_w1);
1545 }
1546
1547 return new_level;
1548 }
1549
1550 /* Advance in the buffer/string, resolve weak types and return the
1551 type of the next character after weak type resolution. */
1552 static bidi_type_t
1553 bidi_resolve_weak (struct bidi_it *bidi_it)
1554 {
1555 bidi_type_t type;
1556 bidi_dir_t override;
1557 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1558 int new_level = bidi_resolve_explicit (bidi_it);
1559 int next_char;
1560 bidi_type_t type_of_next;
1561 struct bidi_it saved_it;
1562 EMACS_INT eob =
1563 (STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1564 ? bidi_it->string.schars : ZV;
1565
1566 type = bidi_it->type;
1567 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1568
1569 if (type == UNKNOWN_BT
1570 || type == LRE
1571 || type == LRO
1572 || type == RLE
1573 || type == RLO
1574 || type == PDF)
1575 abort ();
1576
1577 if (new_level != prev_level
1578 || bidi_it->type == NEUTRAL_B)
1579 {
1580 /* We've got a new embedding level run, compute the directional
1581 type of sor and initialize per-run variables (UAX#9, clause
1582 X10). */
1583 bidi_set_sor_type (bidi_it, prev_level, new_level);
1584 }
1585 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1586 || type == WEAK_BN || type == STRONG_AL)
1587 bidi_it->type_after_w1 = type; /* needed in L1 */
1588 bidi_check_type (bidi_it->type_after_w1);
1589
1590 /* Level and directional override status are already recorded in
1591 bidi_it, and do not need any change; see X6. */
1592 if (override == R2L) /* X6 */
1593 type = STRONG_R;
1594 else if (override == L2R)
1595 type = STRONG_L;
1596 else
1597 {
1598 if (type == WEAK_NSM) /* W1 */
1599 {
1600 /* Note that we don't need to consider the case where the
1601 prev character has its type overridden by an RLO or LRO,
1602 because then either the type of this NSM would have been
1603 also overridden, or the previous character is outside the
1604 current level run, and thus not relevant to this NSM.
1605 This is why NSM gets the type_after_w1 of the previous
1606 character. */
1607 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1608 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1609 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1610 type = bidi_it->prev.type_after_w1;
1611 else if (bidi_it->sor == R2L)
1612 type = STRONG_R;
1613 else if (bidi_it->sor == L2R)
1614 type = STRONG_L;
1615 else /* shouldn't happen! */
1616 abort ();
1617 }
1618 if (type == WEAK_EN /* W2 */
1619 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1620 type = WEAK_AN;
1621 else if (type == STRONG_AL) /* W3 */
1622 type = STRONG_R;
1623 else if ((type == WEAK_ES /* W4 */
1624 && bidi_it->prev.type_after_w1 == WEAK_EN
1625 && bidi_it->prev.orig_type == WEAK_EN)
1626 || (type == WEAK_CS
1627 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1628 && bidi_it->prev.orig_type == WEAK_EN)
1629 || bidi_it->prev.type_after_w1 == WEAK_AN)))
1630 {
1631 const unsigned char *s =
1632 STRINGP (bidi_it->string.lstring)
1633 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1634
1635 next_char =
1636 bidi_it->charpos + bidi_it->nchars >= eob
1637 ? BIDI_EOB
1638 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1639 bidi_it->string.unibyte);
1640 type_of_next = bidi_get_type (next_char, override);
1641
1642 if (type_of_next == WEAK_BN
1643 || bidi_explicit_dir_char (next_char))
1644 {
1645 bidi_copy_it (&saved_it, bidi_it);
1646 while (bidi_resolve_explicit (bidi_it) == new_level
1647 && bidi_it->type == WEAK_BN)
1648 ;
1649 type_of_next = bidi_it->type;
1650 bidi_copy_it (bidi_it, &saved_it);
1651 }
1652
1653 /* If the next character is EN, but the last strong-type
1654 character is AL, that next EN will be changed to AN when
1655 we process it in W2 above. So in that case, this ES
1656 should not be changed into EN. */
1657 if (type == WEAK_ES
1658 && type_of_next == WEAK_EN
1659 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1660 type = WEAK_EN;
1661 else if (type == WEAK_CS)
1662 {
1663 if (bidi_it->prev.type_after_w1 == WEAK_AN
1664 && (type_of_next == WEAK_AN
1665 /* If the next character is EN, but the last
1666 strong-type character is AL, EN will be later
1667 changed to AN when we process it in W2 above.
1668 So in that case, this ES should not be
1669 changed into EN. */
1670 || (type_of_next == WEAK_EN
1671 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1672 type = WEAK_AN;
1673 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1674 && type_of_next == WEAK_EN
1675 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1676 type = WEAK_EN;
1677 }
1678 }
1679 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1680 || type == WEAK_BN) /* W5/Retaining */
1681 {
1682 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1683 || bidi_it->next_en_pos > bidi_it->charpos)
1684 type = WEAK_EN;
1685 else /* W5: ET/BN with EN after it. */
1686 {
1687 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
1688 const unsigned char *s =
1689 STRINGP (bidi_it->string.lstring)
1690 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1691
1692 if (bidi_it->nchars <= 0)
1693 abort ();
1694 next_char =
1695 bidi_it->charpos + bidi_it->nchars >= eob
1696 ? BIDI_EOB
1697 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1698 bidi_it->string.unibyte);
1699 type_of_next = bidi_get_type (next_char, override);
1700
1701 if (type_of_next == WEAK_ET
1702 || type_of_next == WEAK_BN
1703 || bidi_explicit_dir_char (next_char))
1704 {
1705 bidi_copy_it (&saved_it, bidi_it);
1706 while (bidi_resolve_explicit (bidi_it) == new_level
1707 && (bidi_it->type == WEAK_BN
1708 || bidi_it->type == WEAK_ET))
1709 ;
1710 type_of_next = bidi_it->type;
1711 en_pos = bidi_it->charpos;
1712 bidi_copy_it (bidi_it, &saved_it);
1713 }
1714 if (type_of_next == WEAK_EN)
1715 {
1716 /* If the last strong character is AL, the EN we've
1717 found will become AN when we get to it (W2). */
1718 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1719 {
1720 type = WEAK_EN;
1721 /* Remember this EN position, to speed up processing
1722 of the next ETs. */
1723 bidi_it->next_en_pos = en_pos;
1724 }
1725 else if (type == WEAK_BN)
1726 type = NEUTRAL_ON; /* W6/Retaining */
1727 }
1728 }
1729 }
1730 }
1731
1732 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1733 || (type == WEAK_BN
1734 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1735 || bidi_it->prev.type_after_w1 == WEAK_ES
1736 || bidi_it->prev.type_after_w1 == WEAK_ET)))
1737 type = NEUTRAL_ON;
1738
1739 /* Store the type we've got so far, before we clobber it with strong
1740 types in W7 and while resolving neutral types. But leave alone
1741 the original types that were recorded above, because we will need
1742 them for the L1 clause. */
1743 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1744 bidi_it->type_after_w1 = type;
1745 bidi_check_type (bidi_it->type_after_w1);
1746
1747 if (type == WEAK_EN) /* W7 */
1748 {
1749 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1750 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1751 type = STRONG_L;
1752 }
1753
1754 bidi_it->type = type;
1755 bidi_check_type (bidi_it->type);
1756 return type;
1757 }
1758
1759 /* Resolve the type of a neutral character according to the type of
1760 surrounding strong text and the current embedding level. */
1761 static inline bidi_type_t
1762 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1763 {
1764 /* N1: European and Arabic numbers are treated as though they were R. */
1765 if (next_type == WEAK_EN || next_type == WEAK_AN)
1766 next_type = STRONG_R;
1767 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1768 prev_type = STRONG_R;
1769
1770 if (next_type == prev_type) /* N1 */
1771 return next_type;
1772 else if ((lev & 1) == 0) /* N2 */
1773 return STRONG_L;
1774 else
1775 return STRONG_R;
1776 }
1777
1778 static bidi_type_t
1779 bidi_resolve_neutral (struct bidi_it *bidi_it)
1780 {
1781 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1782 bidi_type_t type = bidi_resolve_weak (bidi_it);
1783 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1784
1785 if (!(type == STRONG_R
1786 || type == STRONG_L
1787 || type == WEAK_BN
1788 || type == WEAK_EN
1789 || type == WEAK_AN
1790 || type == NEUTRAL_B
1791 || type == NEUTRAL_S
1792 || type == NEUTRAL_WS
1793 || type == NEUTRAL_ON))
1794 abort ();
1795
1796 if (bidi_get_category (type) == NEUTRAL
1797 || (type == WEAK_BN && prev_level == current_level))
1798 {
1799 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1800 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1801 bidi_it->next_for_neutral.type,
1802 current_level);
1803 else
1804 {
1805 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1806 the assumption of batch-style processing; see clauses W4,
1807 W5, and especially N1, which require to look far forward
1808 (as well as back) in the buffer/string. May the fleas of
1809 a thousand camels infest the armpits of those who design
1810 supposedly general-purpose algorithms by looking at their
1811 own implementations, and fail to consider other possible
1812 implementations! */
1813 struct bidi_it saved_it;
1814 bidi_type_t next_type;
1815
1816 if (bidi_it->scan_dir == -1)
1817 abort ();
1818
1819 bidi_copy_it (&saved_it, bidi_it);
1820 /* Scan the text forward until we find the first non-neutral
1821 character, and then use that to resolve the neutral we
1822 are dealing with now. We also cache the scanned iterator
1823 states, to salvage some of the effort later. */
1824 bidi_cache_iterator_state (bidi_it, 0);
1825 do {
1826 /* Record the info about the previous character, so that
1827 it will be cached below with this state. */
1828 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1829 && bidi_it->type != WEAK_BN)
1830 bidi_remember_char (&bidi_it->prev, bidi_it);
1831 type = bidi_resolve_weak (bidi_it);
1832 /* Paragraph separators have their levels fully resolved
1833 at this point, so cache them as resolved. */
1834 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1835 /* FIXME: implement L1 here, by testing for a newline and
1836 resetting the level for any sequence of whitespace
1837 characters adjacent to it. */
1838 } while (!(type == NEUTRAL_B
1839 || (type != WEAK_BN
1840 && bidi_get_category (type) != NEUTRAL)
1841 /* This is all per level run, so stop when we
1842 reach the end of this level run. */
1843 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1844 current_level));
1845
1846 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1847
1848 switch (type)
1849 {
1850 case STRONG_L:
1851 case STRONG_R:
1852 case STRONG_AL:
1853 next_type = type;
1854 break;
1855 case WEAK_EN:
1856 case WEAK_AN:
1857 /* N1: ``European and Arabic numbers are treated as
1858 though they were R.'' */
1859 next_type = STRONG_R;
1860 saved_it.next_for_neutral.type = STRONG_R;
1861 break;
1862 case WEAK_BN:
1863 if (!bidi_explicit_dir_char (bidi_it->ch))
1864 abort (); /* can't happen: BNs are skipped */
1865 /* FALLTHROUGH */
1866 case NEUTRAL_B:
1867 /* Marched all the way to the end of this level run.
1868 We need to use the eor type, whose information is
1869 stored by bidi_set_sor_type in the prev_for_neutral
1870 member. */
1871 if (saved_it.type != WEAK_BN
1872 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1873 {
1874 next_type = bidi_it->prev_for_neutral.type;
1875 saved_it.next_for_neutral.type = next_type;
1876 bidi_check_type (next_type);
1877 }
1878 else
1879 {
1880 /* This is a BN which does not adjoin neutrals.
1881 Leave its type alone. */
1882 bidi_copy_it (bidi_it, &saved_it);
1883 return bidi_it->type;
1884 }
1885 break;
1886 default:
1887 abort ();
1888 }
1889 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1890 next_type, current_level);
1891 saved_it.type = type;
1892 bidi_check_type (type);
1893 bidi_copy_it (bidi_it, &saved_it);
1894 }
1895 }
1896 return type;
1897 }
1898
1899 /* Given an iterator state in BIDI_IT, advance one character position
1900 in the buffer/string to the next character (in the logical order),
1901 resolve the bidi type of that next character, and return that
1902 type. */
1903 static bidi_type_t
1904 bidi_type_of_next_char (struct bidi_it *bidi_it)
1905 {
1906 bidi_type_t type;
1907
1908 /* This should always be called during a forward scan. */
1909 if (bidi_it->scan_dir != 1)
1910 abort ();
1911
1912 /* Reset the limit until which to ignore BNs if we step out of the
1913 area where we found only empty levels. */
1914 if ((bidi_it->ignore_bn_limit > -1
1915 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1916 || (bidi_it->ignore_bn_limit == -2
1917 && !bidi_explicit_dir_char (bidi_it->ch)))
1918 bidi_it->ignore_bn_limit = -1;
1919
1920 type = bidi_resolve_neutral (bidi_it);
1921
1922 return type;
1923 }
1924
1925 /* Given an iterator state BIDI_IT, advance one character position in
1926 the buffer/string to the next character (in the current scan
1927 direction), resolve the embedding and implicit levels of that next
1928 character, and return the resulting level. */
1929 static int
1930 bidi_level_of_next_char (struct bidi_it *bidi_it)
1931 {
1932 bidi_type_t type;
1933 int level, prev_level = -1;
1934 struct bidi_saved_info next_for_neutral;
1935 EMACS_INT next_char_pos = -2;
1936
1937 if (bidi_it->scan_dir == 1)
1938 {
1939 EMACS_INT eob =
1940 (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1941 ? bidi_it->string.schars : ZV;
1942
1943 /* There's no sense in trying to advance if we hit end of text. */
1944 if (bidi_it->charpos >= eob)
1945 return bidi_it->resolved_level;
1946
1947 /* Record the info about the previous character. */
1948 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1949 && bidi_it->type != WEAK_BN)
1950 bidi_remember_char (&bidi_it->prev, bidi_it);
1951 if (bidi_it->type_after_w1 == STRONG_R
1952 || bidi_it->type_after_w1 == STRONG_L
1953 || bidi_it->type_after_w1 == STRONG_AL)
1954 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1955 /* FIXME: it sounds like we don't need both prev and
1956 prev_for_neutral members, but I'm leaving them both for now. */
1957 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1958 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1959 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1960
1961 /* If we overstepped the characters used for resolving neutrals
1962 and whitespace, invalidate their info in the iterator. */
1963 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1964 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1965 if (bidi_it->next_en_pos >= 0
1966 && bidi_it->charpos >= bidi_it->next_en_pos)
1967 bidi_it->next_en_pos = -1;
1968 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1969 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1970 bidi_it->next_for_ws.type = UNKNOWN_BT;
1971
1972 /* This must be taken before we fill the iterator with the info
1973 about the next char. If we scan backwards, the iterator
1974 state must be already cached, so there's no need to know the
1975 embedding level of the previous character, since we will be
1976 returning to our caller shortly. */
1977 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1978 }
1979 next_for_neutral = bidi_it->next_for_neutral;
1980
1981 /* Perhaps the character we want is already cached. If it is, the
1982 call to bidi_cache_find below will return a type other than
1983 UNKNOWN_BT. */
1984 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
1985 {
1986 int bob =
1987 (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) ? 0 : 1;
1988
1989 if (bidi_it->scan_dir > 0)
1990 {
1991 if (bidi_it->nchars <= 0)
1992 abort ();
1993 next_char_pos = bidi_it->charpos + bidi_it->nchars;
1994 }
1995 else if (bidi_it->charpos >= bob)
1996 /* Implementation note: we allow next_char_pos to be as low as
1997 0 for buffers or -1 for strings, and that is okay because
1998 that's the "position" of the sentinel iterator state we
1999 cached at the beginning of the iteration. */
2000 next_char_pos = bidi_it->charpos - 1;
2001 if (next_char_pos >= bob - 1)
2002 type = bidi_cache_find (next_char_pos, -1, bidi_it);
2003 else
2004 type = UNKNOWN_BT;
2005 }
2006 else
2007 type = UNKNOWN_BT;
2008 if (type != UNKNOWN_BT)
2009 {
2010 /* Don't lose the information for resolving neutrals! The
2011 cached states could have been cached before their
2012 next_for_neutral member was computed. If we are on our way
2013 forward, we can simply take the info from the previous
2014 state. */
2015 if (bidi_it->scan_dir == 1
2016 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
2017 bidi_it->next_for_neutral = next_for_neutral;
2018
2019 /* If resolved_level is -1, it means this state was cached
2020 before it was completely resolved, so we cannot return
2021 it. */
2022 if (bidi_it->resolved_level != -1)
2023 return bidi_it->resolved_level;
2024 }
2025 if (bidi_it->scan_dir == -1)
2026 /* If we are going backwards, the iterator state is already cached
2027 from previous scans, and should be fully resolved. */
2028 abort ();
2029
2030 if (type == UNKNOWN_BT)
2031 type = bidi_type_of_next_char (bidi_it);
2032
2033 if (type == NEUTRAL_B)
2034 return bidi_it->resolved_level;
2035
2036 level = bidi_it->level_stack[bidi_it->stack_idx].level;
2037 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2038 || (type == WEAK_BN && prev_level == level))
2039 {
2040 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
2041 abort ();
2042
2043 /* If the cached state shows a neutral character, it was not
2044 resolved by bidi_resolve_neutral, so do it now. */
2045 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2046 bidi_it->next_for_neutral.type,
2047 level);
2048 }
2049
2050 if (!(type == STRONG_R
2051 || type == STRONG_L
2052 || type == WEAK_BN
2053 || type == WEAK_EN
2054 || type == WEAK_AN))
2055 abort ();
2056 bidi_it->type = type;
2057 bidi_check_type (bidi_it->type);
2058
2059 /* For L1 below, we need to know, for each WS character, whether
2060 it belongs to a sequence of WS characters preceding a newline
2061 or a TAB or a paragraph separator. */
2062 if (bidi_it->orig_type == NEUTRAL_WS
2063 && bidi_it->next_for_ws.type == UNKNOWN_BT)
2064 {
2065 int ch;
2066 EMACS_INT clen = bidi_it->ch_len;
2067 EMACS_INT bpos = bidi_it->bytepos;
2068 EMACS_INT cpos = bidi_it->charpos;
2069 EMACS_INT disp_pos = bidi_it->disp_pos;
2070 EMACS_INT nc = bidi_it->nchars;
2071 struct bidi_string_data bs = bidi_it->string;
2072 bidi_type_t chtype;
2073 int fwp = bidi_it->frame_window_p;
2074 int dpp = bidi_it->disp_prop_p;
2075
2076 if (bidi_it->nchars <= 0)
2077 abort ();
2078 do {
2079 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &dpp, &bs,
2080 fwp, &clen, &nc);
2081 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
2082 chtype = NEUTRAL_B;
2083 else
2084 chtype = bidi_get_type (ch, NEUTRAL_DIR);
2085 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2086 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2087 bidi_it->next_for_ws.type = chtype;
2088 bidi_check_type (bidi_it->next_for_ws.type);
2089 bidi_it->next_for_ws.charpos = cpos;
2090 bidi_it->next_for_ws.bytepos = bpos;
2091 }
2092
2093 /* Resolve implicit levels, with a twist: PDFs get the embedding
2094 level of the enbedding they terminate. See below for the
2095 reason. */
2096 if (bidi_it->orig_type == PDF
2097 /* Don't do this if this formatting code didn't change the
2098 embedding level due to invalid or empty embeddings. */
2099 && prev_level != level)
2100 {
2101 /* Don't look in UAX#9 for the reason for this: it's our own
2102 private quirk. The reason is that we want the formatting
2103 codes to be delivered so that they bracket the text of their
2104 embedding. For example, given the text
2105
2106 {RLO}teST{PDF}
2107
2108 we want it to be displayed as
2109
2110 {PDF}STet{RLO}
2111
2112 not as
2113
2114 STet{RLO}{PDF}
2115
2116 which will result because we bump up the embedding level as
2117 soon as we see the RLO and pop it as soon as we see the PDF,
2118 so RLO itself has the same embedding level as "teST", and
2119 thus would be normally delivered last, just before the PDF.
2120 The switch below fiddles with the level of PDF so that this
2121 ugly side effect does not happen.
2122
2123 (This is, of course, only important if the formatting codes
2124 are actually displayed, but Emacs does need to display them
2125 if the user wants to.) */
2126 level = prev_level;
2127 }
2128 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2129 || bidi_it->orig_type == NEUTRAL_S
2130 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
2131 /* || bidi_it->ch == LINESEP_CHAR */
2132 || (bidi_it->orig_type == NEUTRAL_WS
2133 && (bidi_it->next_for_ws.type == NEUTRAL_B
2134 || bidi_it->next_for_ws.type == NEUTRAL_S)))
2135 level = bidi_it->level_stack[0].level;
2136 else if ((level & 1) == 0) /* I1 */
2137 {
2138 if (type == STRONG_R)
2139 level++;
2140 else if (type == WEAK_EN || type == WEAK_AN)
2141 level += 2;
2142 }
2143 else /* I2 */
2144 {
2145 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2146 level++;
2147 }
2148
2149 bidi_it->resolved_level = level;
2150 return level;
2151 }
2152
2153 /* Move to the other edge of a level given by LEVEL. If END_FLAG is
2154 non-zero, we are at the end of a level, and we need to prepare to
2155 resume the scan of the lower level.
2156
2157 If this level's other edge is cached, we simply jump to it, filling
2158 the iterator structure with the iterator state on the other edge.
2159 Otherwise, we walk the buffer or string until we come back to the
2160 same level as LEVEL.
2161
2162 Note: we are not talking here about a ``level run'' in the UAX#9
2163 sense of the term, but rather about a ``level'' which includes
2164 all the levels higher than it. In other words, given the levels
2165 like this:
2166
2167 11111112222222333333334443343222222111111112223322111
2168 A B C
2169
2170 and assuming we are at point A scanning left to right, this
2171 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2172 at point B. */
2173 static void
2174 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
2175 {
2176 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
2177 ptrdiff_t idx;
2178
2179 /* Try the cache first. */
2180 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2181 >= bidi_cache_start)
2182 bidi_cache_fetch_state (idx, bidi_it);
2183 else
2184 {
2185 int new_level;
2186
2187 if (end_flag)
2188 abort (); /* if we are at end of level, its edges must be cached */
2189
2190 bidi_cache_iterator_state (bidi_it, 1);
2191 do {
2192 new_level = bidi_level_of_next_char (bidi_it);
2193 bidi_cache_iterator_state (bidi_it, 1);
2194 } while (new_level >= level);
2195 }
2196 }
2197
2198 void
2199 bidi_move_to_visually_next (struct bidi_it *bidi_it)
2200 {
2201 int old_level, new_level, next_level;
2202 struct bidi_it sentinel;
2203 struct gcpro gcpro1;
2204
2205 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2206 abort ();
2207
2208 if (bidi_it->scan_dir == 0)
2209 {
2210 bidi_it->scan_dir = 1; /* default to logical order */
2211 }
2212
2213 /* The code below can call eval, and thus cause GC. If we are
2214 iterating a Lisp string, make sure it won't be GCed. */
2215 if (STRINGP (bidi_it->string.lstring))
2216 GCPRO1 (bidi_it->string.lstring);
2217
2218 /* If we just passed a newline, initialize for the next line. */
2219 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
2220 bidi_line_init (bidi_it);
2221
2222 /* Prepare the sentinel iterator state, and cache it. When we bump
2223 into it, scanning backwards, we'll know that the last non-base
2224 level is exhausted. */
2225 if (bidi_cache_idx == bidi_cache_start)
2226 {
2227 bidi_copy_it (&sentinel, bidi_it);
2228 if (bidi_it->first_elt)
2229 {
2230 sentinel.charpos--; /* cached charpos needs to be monotonic */
2231 sentinel.bytepos--;
2232 sentinel.ch = '\n'; /* doesn't matter, but why not? */
2233 sentinel.ch_len = 1;
2234 sentinel.nchars = 1;
2235 }
2236 bidi_cache_iterator_state (&sentinel, 1);
2237 }
2238
2239 old_level = bidi_it->resolved_level;
2240 new_level = bidi_level_of_next_char (bidi_it);
2241
2242 /* Reordering of resolved levels (clause L2) is implemented by
2243 jumping to the other edge of the level and flipping direction of
2244 scanning the text whenever we find a level change. */
2245 if (new_level != old_level)
2246 {
2247 int ascending = new_level > old_level;
2248 int level_to_search = ascending ? old_level + 1 : old_level;
2249 int incr = ascending ? 1 : -1;
2250 int expected_next_level = old_level + incr;
2251
2252 /* Jump (or walk) to the other edge of this level. */
2253 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2254 /* Switch scan direction and peek at the next character in the
2255 new direction. */
2256 bidi_it->scan_dir = -bidi_it->scan_dir;
2257
2258 /* The following loop handles the case where the resolved level
2259 jumps by more than one. This is typical for numbers inside a
2260 run of text with left-to-right embedding direction, but can
2261 also happen in other situations. In those cases the decision
2262 where to continue after a level change, and in what direction,
2263 is tricky. For example, given a text like below:
2264
2265 abcdefgh
2266 11336622
2267
2268 (where the numbers below the text show the resolved levels),
2269 the result of reordering according to UAX#9 should be this:
2270
2271 efdcghba
2272
2273 This is implemented by the loop below which flips direction
2274 and jumps to the other edge of the level each time it finds
2275 the new level not to be the expected one. The expected level
2276 is always one more or one less than the previous one. */
2277 next_level = bidi_peek_at_next_level (bidi_it);
2278 while (next_level != expected_next_level)
2279 {
2280 expected_next_level += incr;
2281 level_to_search += incr;
2282 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2283 bidi_it->scan_dir = -bidi_it->scan_dir;
2284 next_level = bidi_peek_at_next_level (bidi_it);
2285 }
2286
2287 /* Finally, deliver the next character in the new direction. */
2288 next_level = bidi_level_of_next_char (bidi_it);
2289 }
2290
2291 /* Take note when we have just processed the newline that precedes
2292 the end of the paragraph. The next time we are about to be
2293 called, set_iterator_to_next will automatically reinit the
2294 paragraph direction, if needed. We do this at the newline before
2295 the paragraph separator, because the next character might not be
2296 the first character of the next paragraph, due to the bidi
2297 reordering, whereas we _must_ know the paragraph base direction
2298 _before_ we process the paragraph's text, since the base
2299 direction affects the reordering. */
2300 if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
2301 {
2302 /* The paragraph direction of the entire string, once
2303 determined, is in effect for the entire string. Setting the
2304 separator limit to the end of the string prevents
2305 bidi_paragraph_init from being called automatically on this
2306 string. */
2307 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2308 bidi_it->separator_limit = bidi_it->string.schars;
2309 else if (bidi_it->bytepos < ZV_BYTE)
2310 {
2311 EMACS_INT sep_len =
2312 bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2313 bidi_it->bytepos + bidi_it->ch_len);
2314 if (bidi_it->nchars <= 0)
2315 abort ();
2316 if (sep_len >= 0)
2317 {
2318 bidi_it->new_paragraph = 1;
2319 /* Record the buffer position of the last character of the
2320 paragraph separator. */
2321 bidi_it->separator_limit =
2322 bidi_it->charpos + bidi_it->nchars + sep_len;
2323 }
2324 }
2325 }
2326
2327 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
2328 {
2329 /* If we are at paragraph's base embedding level and beyond the
2330 last cached position, the cache's job is done and we can
2331 discard it. */
2332 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
2333 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2334 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
2335 bidi_cache_reset ();
2336 /* But as long as we are caching during forward scan, we must
2337 cache each state, or else the cache integrity will be
2338 compromised: it assumes cached states correspond to buffer
2339 positions 1:1. */
2340 else
2341 bidi_cache_iterator_state (bidi_it, 1);
2342 }
2343
2344 if (STRINGP (bidi_it->string.lstring))
2345 UNGCPRO;
2346 }
2347
2348 /* This is meant to be called from within the debugger, whenever you
2349 wish to examine the cache contents. */
2350 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
2351 void
2352 bidi_dump_cached_states (void)
2353 {
2354 ptrdiff_t i;
2355 int ndigits = 1;
2356
2357 if (bidi_cache_idx == 0)
2358 {
2359 fprintf (stderr, "The cache is empty.\n");
2360 return;
2361 }
2362 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
2363 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2364
2365 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2366 ndigits++;
2367 fputs ("ch ", stderr);
2368 for (i = 0; i < bidi_cache_idx; i++)
2369 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2370 fputs ("\n", stderr);
2371 fputs ("lvl ", stderr);
2372 for (i = 0; i < bidi_cache_idx; i++)
2373 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2374 fputs ("\n", stderr);
2375 fputs ("pos ", stderr);
2376 for (i = 0; i < bidi_cache_idx; i++)
2377 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
2378 fputs ("\n", stderr);
2379 }