1 ;;; lex.el --- Lexical analyser construction
3 ;; Copyright (C) 2008,2013 Free Software Foundation, Inc.
5 ;; Author: Stefan Monnier <monnier@iro.umontreal.ca>
8 ;; This program is free software; you can redistribute it and/or modify
9 ;; it under the terms of the GNU General Public License as published by
10 ;; the Free Software Foundation, either version 3 of the License, or
11 ;; (at your option) any later version.
13 ;; This program is distributed in the hope that it will be useful,
14 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;; GNU General Public License for more details.
18 ;; You should have received a copy of the GNU General Public License
19 ;; along with this program. If not, see <http://www.gnu.org/licenses/>.
23 ;; Format of regexps is the same as used for `rx' and `sregex'.
25 ;; - (ere RE) specify regexps using the ERE syntax.
26 ;; - (inter REs...) (aka `&') make a regexp that only matches
27 ;; if all its branches match. E.g. (inter (ere ".*a.*") (ere ".*b.*"))
28 ;; matches any string that contains both an "a" and a "b", in any order.
29 ;; - (case-fold REs...) and (case-sensitive REs...) make a regexp that
30 ;; is case sensitive or not, regardless of case-fold-search.
32 ;; Input format of lexers:
34 ;; ALIST of the form ((RE . VAL) ...)
36 ;; Format of compiled DFA lexers:
38 ;; nil ; The trivial lexer that fails
40 ;; (table . CHAR-TABLE)
41 ;; (stop VAL . LEXER) ; Match the empty string at point or LEXER.
42 ;; (check (PREDICATE . ARG) SUCCESS-LEXER . FAILURE-LEXER)
44 ;; Intermediate NFA nodes may additionally look like:
49 ;; Note: we call those things "NFA"s but they're not really NFAs.
53 ;; - `inter' doesn't work right. Matching `join' to the corresponding `and'
54 ;; is done incorrectly in some cases.
55 ;; - since `negate' uses intersections, it doesn't work right either.
56 ;; - "(\<)*" leads to a DFA that gets stuck in a cycle.
60 ;; - dfa "no-fail" simplifier
62 ;; - dfa compaction (different representation)
65 ;; - search rather than just match
67 ;; - repeated submatches
69 ;; - lookbehind and lookahead
70 ;; - match(&search?) backward
80 ;; To turn a match into a search, the basic idea is to use ".*RE" to get
81 ;; a search-DFA as opposed to the match-DFA generated from "RE".
83 ;; Search in Plan9's regexp library is done as follows: match ".*RE" until
84 ;; reaching the first match and then continue with only "RE". The first
85 ;; ".*RE" match corresponds to a search success for the leftmost shortest
86 ;; match. If we want the longest match, we need to continue. But if we
87 ;; continue with ".*RE" then we have no idea when to stop, so we should only
88 ;; continue with "RE".
89 ;; Downside: we may still match things after the "leftmost longest" match,
90 ;; but hopefully will stop soon after. I.e. we may look at chars past the
91 ;; end of the leftmost longest match, but hopefully not too many.
94 ;; - Like emacs/src/regexp.c, we can just start a match at every buffer
95 ;; position. Advantage: no need for submatch info in order to find
96 ;; (match-beginning 0), no need for a separate search-DFA.
97 ;; Downside: O(N^2) rather than O(N). But it's no worse than what we have
98 ;; lived with for decades in src/regexp.c.
100 ;; - After the shortest-search, stop the search and do a longest-match
101 ;; starting at position (match-beginning 0). The good thing is that we
102 ;; will not look at any char further than needed. Also we don't need to
103 ;; figure out how to switch from ".*RE" to "RE" in the middle of the search.
104 ;; The downside is that we end up looking twice at the chars common to the
105 ;; shortest and longest matches. Also this doesn't work: the shortest
106 ;; match may not be the leftmost match, so we can't just start the match
107 ;; at (match-beginning 0).
109 ;; - Generate a specialized search&match-DFA which encodes the job done by
110 ;; Plan9's regexp library. I.e. do a specialized merge on
111 ;; (or LEXER (anything . LEXER)) where whenever we get a `stop' we don't
112 ;; merge any more. After matching such a lexer, we still have to figure out
113 ;; which of the matches we had is the leftmost longest match, of course.
114 ;; Actually, it's not that easy: the tail of a `stop' in the match-DFA can
115 ;; only match things whose (match-beginning 0) may be the same as the one
116 ;; of the `stop', whereas we also want to accept longer matches that start
117 ;; before (match-beginning 0). So we want to keep merging on the tail of
118 ;; `stop' nodes, but only "partially" (whatever that means).
120 ;; - Better yet, do what TRE does: after the shortest-search, use the
121 ;; submatch data to figure out the NFA states (corresponding to the
122 ;; current search-DFA state) which are only reachable from later starting
123 ;; positions than (match-beginning 0), remove them and figure out from
124 ;; that the match-DFA state to which to switch. Problem is: there might
125 ;; not be any such state in the match-DFA.
127 ;; - In the end I do a mix of the last 2: .*?RE
128 ;; This uses the `orelse' merge operator, which contrary to `or' only
129 ;; matches the righthand side when the lefthand side fails to match.
130 ;; It turns out to be fairly simple to implement, and is optimal.
135 ;; I suspect that the (?=<RE>) lookahead can be encoded using something like
136 ;; `andalso'. Of course, it can also trivially be encoded as a predicate,
137 ;; but then we get an O(N^2) complexity.
139 ;; Merging operators.
140 ;; ------------------
142 ;; The NFA merging operators (or, and, orelse) seem to work fine on their own,
143 ;; but I'm not convinced they always DTRT when combined. It's not even
144 ;; clear that the NFA->DFA conversion terminates in all such cases.
149 ;; Implementing the `inter' regexp operator turns out to be more difficult
150 ;; than it seemed. The problem is basically in the `join'. Each `and' has
151 ;; to have its own matching `join', but preserving this invariant is
152 ;; tricky. Among other things, we cannot flatten nested `and's like we do
153 ;; for `or's and `orelse's.
158 ;; Keeping track of submatch info with a DFA is tricky business and can slow
159 ;; down the matcher or make it use algorithmically more memory
160 ;; (e.g. O(textsize)). Here are some approaches:
162 ;; - Reproduce what an NFA matcher would do: when compiling the DFA, keep
163 ;; track of the NFA nodes corresponding to each DFA node, and for every
164 ;; transition, check the mapping between "incoming NFA nodes" and
165 ;; "outgoing NFA nodes" to maintain the list of submatch-info (one element
168 ;; - Keep a log of the states traversed during matching, so at the end it
169 ;; can be used to reproduce the parse tree or submatch info, based on
170 ;; auxiliary tables constructed during the DFA construction.
172 ;; - Some submatch info can be maintained cheaply: basically a submatch
173 ;; position can be represented by a single global variable in the case
174 ;; where the following holds: every ε transition in the NFA
175 ;; which corresponds to this submatch point has the following property:
176 ;; no other ε transition for this same submatch can be traversed between
177 ;; the text position where this transition is traversed and the position
178 ;; where the target NFA subgraph fails to match.
184 (eval-when-compile (require 'cl-lib))
187 (unless (fboundp 'case-table-get-table)
189 (defun case-table-get-table (case-table table)
190 "Return the TABLE of CASE-TABLE.
191 TABLE can be `down', `up', `eqv' or `canon'."
192 (let ((slot-nb (cdr (assq table '((up . 0) (canon . 1) (eqv . 2))))))
193 (or (if (eq table 'down) case-table)
194 (char-table-extra-slot case-table slot-nb)
195 (let ((old (standard-case-table)))
198 (set-standard-case-table case-table)
199 (char-table-extra-slot case-table slot-nb))
200 (or (eq case-table old)
201 (set-standard-case-table old)))))))))
203 (defun copy-char-table (ct1)
204 (let* ((subtype (char-table-subtype ct1))
205 (ct2 (make-char-table subtype)))
206 (map-char-table (lambda (c v) (set-char-table-range ct2 c v)) ct1)
207 (dotimes (i (or (get subtype 'char-table-extra-slots) 0))
208 (set-char-table-extra-slot ct2 i (char-table-extra-slot ct1 i)))
211 (defun lex--char-table->alist (ct)
213 (map-char-table (lambda (k v)
214 (push (cons (if (consp k)
215 ;; If k is a cons cell, we have to
216 ;; copy it because map-char-table
218 (cons (car k) (cdr k))
219 ;; Otherwise, create a trivial cons-cell
220 ;; so we have fewer cases to handle.
227 (defun lex--merge-into (op al1 al2 ct)
228 (cl-assert (memq op '(and or orelse)))
229 ;; We assume that map-char-table calls its function with increasing
232 (let ((k1 (caar al1)) (k2 (caar al2)))
236 (set-char-table-range ct k1
237 (lex--merge op (cdr (pop al1)) (cdr (pop al2)))))
238 ;; k1 strictly greater than k2.
239 ((and (consp k1) (consp k2) (> (car k1) (cdr k2)))
240 (let ((v (cdr (pop al1))))
241 (if (not (eq op 'and)) (set-char-table-range ct k1 v))))
242 ;; k2 strictly greater than k1.
243 ((and (consp k1) (consp k2) (> (car k2) (cdr k1)))
244 (let ((v (cdr (pop al2))))
245 (if (not (eq op 'and)) (set-char-table-range ct k2 v))))
246 ;; There's partial overlap.
247 ((and (consp k1) (consp k2) (> (cdr k1) (cdr k2)))
248 (if (not (eq op 'and))
249 (set-char-table-range ct (cons (1+ (cdr k2)) (cdr k1)) (cdar al1)))
250 (setcdr k1 (cdr k2)))
251 ((and (consp k1) (consp k2) (< (cdr k1) (cdr k2)))
252 (if (not (eq op 'and))
253 (set-char-table-range ct (cons (1+ (cdr k1)) (cdr k2)) (cdar al2)))
254 (setcdr k2 (cdr k1)))
255 ;; Now the tails are equal.
256 ((and (consp k1) (consp k2) (> (car k1) (car k2)))
257 (set-char-table-range ct k1 (lex--merge op (cdr (pop al1)) (cdar al2)))
258 (setcdr k2 (1- (car k1))))
259 ((and (consp k1) (consp k2) (< (car k1) (car k2)))
260 (set-char-table-range ct k2 (lex--merge op (cdar al1) (cdr (pop al2))))
261 (setcdr k1 (1- (car k2))))
262 (t (cl-assert nil)))))
263 (if (not (eq op 'and))
264 (dolist (x (or al1 al2))
265 (set-char-table-range ct (car x) (cdr x))))
;; Memoization hash table.  Declared here without a value; it is let-bound
;; by `lex-negate' (an `eq' table) and by `lex--dfa-wrapper' (a
;; `lex--set-eq' table), and accessed with `gethash'/`puthash' by
;; `lex--merge', `lex--negate-inftail' and `lex--negate-memo'.
269 (defvar lex--memoize)
271 (defun lex--set-eq (l1 l2)
272 (let ((len (length l2)))
273 (setq l2 (copy-sequence l2))
275 (cl-assert (= len (length l2)))
277 (setq len (length (setq l2 (delq (pop l1) l2)))))
281 (define-hash-table-test 'lex--set-eq 'lex--set-eq
286 (if (memq x l) (progn (debug) nil)
287 (setq hash (+ hash (sxhash x))))))
291 (defun lex--flatten-state (state)
292 (cl-assert (memq (car state) '(and or orelse)))
293 (let ((op (car state))
298 (setq state (pop todo))
300 ((null state) (if (eq op 'and) (setq res nil todo nil)))
301 ((memq state done) nil)
302 ((eq (car-safe state) op)
304 (setq todo (append (cdr state) todo)))
305 (t (unless (memq state res) (push state res)))))
306 (cons op (nreverse res))))
308 (defun lex--merge-2 (op lex1 lex2)
309 (cl-assert (memq op '(and or orelse)))
310 ;; The order between lex1 and lex2 matters: preference is given to lex1.
312 ;; `lex1' and `lex2' might actually be the same when we use this code to
313 ;; cancel out the `and' and the `join' from lex--merge-and-join.
314 ;; ((eq lex1 lex2) (debug) lex1) ;CHECK: ruled out by `lex--flatten-state'?
315 ;; ((equal lex1 lex2) lex1) ;Stack overflow :-(
317 ;; Handle the 2 possible nil cases.
318 ;; CHECK: ruled out by `lex--flatten-state'?
319 ((null lex1) (debug) (if (eq op 'and) nil lex2))
320 ((null lex2) (debug) (if (eq op 'and) nil lex1))
322 ;; Do the predicate cases before the `stop' because the stop should
323 ;; always come after the checks.
324 ;; TODO: add optimizations for pairs of `checks' which are redundant,
325 ;; or mutually exclusive, ... although we can also do it in lex-optimize.
326 ((and (eq (car lex1) 'check) (eq (car lex2) 'check)
327 (equal (nth 1 lex1) (nth 1 lex2))) ; Same predicate.
328 (cl-list* 'check (nth 1 lex1)
329 (lex--merge op (nth 2 lex1) (nth 2 lex2))
330 (lex--merge op (nthcdr 3 lex1) (nthcdr 3 lex2))))
331 ((eq (car lex1) 'check)
332 (cl-list* 'check (nth 1 lex1)
333 (lex--merge op (nth 2 lex1) lex2)
334 (lex--merge op (nthcdr 3 lex1) lex2)))
335 ((eq (car lex2) 'check)
336 (cl-list* 'check (nth 1 lex2)
337 (lex--merge op lex1 (nth 2 lex2))
338 (lex--merge op lex1 (nthcdr 3 lex2))))
340 ;; Joins have the form (join CONT . EXIT) where EXIT is a lexer
341 ;; corresponding to the rest of the regexp after the `and' sub-regexp.
342 ;; All the joins corresponding to the same `and' have the same EXIT.
343 ;; CONT is a lexer that contains another join inside, it corresponds to
344 ;; the decision to not yet leave the `and'.
345 ((and (eq (car lex1) 'join) (eq (car lex2) 'join))
346 (cl-assert (eq (cddr lex1) (cddr lex2))) ;Check they're the same join.
347 (let ((in (lex--merge op (cadr lex1) (cadr lex2))))
349 ;; Eliminate the join once it was all merged.
350 ;; FIXME: This arbitrarily chooses `or' instead of `orelse',
351 ;; and it arbitrarily gives CONT precedence over EXIT.
352 (lex--merge 'or in (cddr lex1))
353 `(join ,in ,@(cddr lex1)))))
354 ;; If one of the two lex's is a join but the other is not, the other must
355 ;; contain a corresponding join somewhere inside.
356 ((eq (car lex1) 'join)
357 (let ((next (lex--merge op (nth 1 lex1) lex2)))
358 ;; lex1 is a valid exit point but lex2 isn't.
361 ;; FIXME: lex1 is implicitly an `or(else)' between (cadr lex1) and
362 ;; (cddr lex1). Here we construct an `or(else)' between `next' and
363 ;; (cddr lex1). I.e. we lose the `op' and we do not preserve the
364 ;; ordering between lex2 and (cddr lex1).
365 `(join ,next ,@(cddr lex1)))))
366 ((eq (car lex2) 'join)
367 (let ((next (lex--merge op lex1 (nth 1 lex2))))
368 (if (eq op 'and) next `(join ,next ,@(cddr lex2)))))
370 ;; The three `stop' cases.
371 ((and (eq (car lex1) 'stop) (eq (car lex2) 'stop))
372 ;; Here is where we give precedence to `lex1'.
373 (if (eq op 'orelse) lex1
374 (cl-list* 'stop (cadr lex1) (lex--merge op (cddr lex1) (cddr lex2)))))
375 ((eq (car lex1) 'stop)
376 (let ((next (lex--merge op (cddr lex1) lex2)))
378 (`or (cl-list* 'stop (cadr lex1) next))
380 ;; CHECK: We should have hit a `join' before reaching a `stop'.
382 (_ (error "lex.el: got %S but expected one of or/and/orelse"
384 ((eq (car lex2) 'stop)
385 (let ((next (lex--merge op lex1 (cddr lex2))))
386 ;; For `orelse', we want here to delay the `stop' until the point
387 ;; where we know that lex1 doesn't match. Sadly, I don't know how to
390 ;; FIXME: One thing we can do is to mark the value attached to the
391 ;; `stop' so as to indicate that an earlier match may finish later.
392 ;; This way, if the match is not `earlystop', we know it's one of
393 ;; the leftmost ones, and maybe the search loop can avoid some work
394 ;; when determining which is the leftmost longest match.
395 (`orelse (cl-list* 'stop `(earlystop ,(cadr lex2)) next))
396 ((or `or `orelse) (cl-list* 'stop (cadr lex2) next))
397 ;; CHECK: We should have hit a `join' before reaching a `stop'.
399 (_ (error "lex.el: got %S but expected one of or/and/orelse"
402 ;; The most general case.
403 ((and (eq (car lex1) 'table) (eq (car lex2) 'table))
404 (let ((al1 (lex--char-table->alist (cdr lex1)))
405 (al2 (lex--char-table->alist (cdr lex2)))
406 (ct (make-char-table 'lexer)))
407 (lex--merge-into op al1 al2 ct)
410 ((and (characterp (car lex1)) (characterp (car lex2))
411 (eq (car lex1) (car lex2)))
412 (cons (car lex1) (lex--merge op (cdr lex1) (cdr lex2))))
413 ((and (characterp (car lex1)) (characterp (car lex2)))
415 (let ((ct (make-char-table 'lexer)))
416 (aset ct (car lex1) (cdr lex1))
417 (aset ct (car lex2) (cdr lex2))
419 ((and (characterp (car lex1)) (eq (car lex2) 'table))
420 (let ((next (lex--merge op (cdr lex1) (aref (cdr lex2) (car lex1)))))
422 (if next (cons (car lex1) next))
423 (let ((ct (copy-sequence (cdr lex2))))
424 (aset ct (car lex1) next)
426 ((and (eq (car lex1) 'table) (characterp (car lex2)))
427 (let ((next (lex--merge op (aref (cdr lex1) (car lex2)) (cdr lex2))))
429 (if next (cons (car lex2) next))
430 (let ((ct (copy-sequence (cdr lex1))))
431 (aset ct (car lex2) next)
434 ((or (memq (car lex1) '(or orelse and)) ;state
435 (memq (car lex2) '(or orelse and))) ;state
436 ;; `state' nodes are nodes whose content is not known yet, so we
437 ;; have to delay the merge via the memoization table.
438 ;; `or' and `and' nodes should only happen when the other `op' is being
439 ;; performed, in which case we can't do the merge either before lex1
440 ;; and lex2 have both been merged.
441 (lex--merge op lex1 lex2))
442 (t (cl-assert nil))))
444 (defun lex--merge-now (&rest state)
445 (cl-assert (memq (car state) '(and or orelse)))
446 ;; Re-flatten, in case one of the sub-states was changed.
447 (setq state (lex--flatten-state state))
448 (if (<= (length state) 2)
449 (if (eq (car state) 'and)
450 ;; Need to strip out the `join's.
451 (lex--merge-and-join (cadr state))
453 (let ((op (pop state))
456 ;; CHECK: we fold the lexers using left-associativity.
457 ;; For `orelse', that means that `earlystop' never accumulates,
458 ;; whereas if we folded in a right-associative way, we could get
459 ;; some (earlystop (earlystop (earlystop V))). Not sure which one's
460 ;; preferable, so let's stick with what we have for now.
461 (setq res (lex--merge-2 op res lex)))
(defun lex--merge-and-join (lex)
  "Merge LEX with itself using the `and' operator.
`lex--merge-now' calls this on an `and' state that is left with a
single branch, in order to strip out that branch's `join' nodes."
  (let ((self lex))
    ;; Both operands are deliberately the very same object.
    (lex--merge-2 'and self self)))
468 (defun lex--merge (&rest state)
469 (cl-assert (memq (car state) '(and or orelse)))
470 (setq state (lex--flatten-state state))
471 (if (and (<= (length state) 2)
472 (not (eq (car state) 'and)))
474 (or (gethash state lex--memoize)
477 (cl-assert (memq (car state) '(and or orelse)))
478 (push state lex--states)
479 ;; The `state' node will be later on modified via setcar/setcdr,
480 ;; so be careful to use a copy of it for the key.
481 (puthash (cons (car state) (cdr state)) state lex--memoize)
484 (defun lex--compile-category (category)
485 (if (and (integerp category) (< category 128))
487 (if (symbolp category)
488 (if (= 1 (length (symbol-name category)))
489 (aref (symbol-name category) 0)
491 (defvar rx-categories)
492 (cdr (assq category rx-categories))))))
494 (defun lex--compile-syntax (&rest syntaxes)
496 (if (and (integerp x) (< x 32)) x
498 (setq x (if (= 1 (length (symbol-name x)))
502 (cdr (assq x rx-syntax)))))
503 (if (characterp x) (setq x (string x)))
504 (car (string-to-syntax x))))
507 (defconst lex--char-classes
508 `((alnum alpha digit)
509 (alpha word (?a . ?z) (?A . ?Z))
511 (cntrl (?\0 . ?\C-_))
513 ;; Include all multibyte chars, plus all the bytes except 128-159.
514 (graph (?! . ?~) multibyte (#x3fffa0 . #x3fffff))
515 ;; src/regexp.c handles case-folding inconsistently: lower and upper
516 ;; match both lower- and uppercase ascii chars, but lower also matches
517 ;; uppercase non-ascii chars whereas upper does not match lowercase
518 ;; nonascii chars. Here I simply ignore case-fold for [:lower:] and
519 ;; [:upper:] because it's simpler and doesn't seem worse.
520 (lower (check (lex--match-lower)))
521 (upper (check (lex--match-upper)))
523 (punct (check (not (lex--match-syntax . ,(lex--compile-syntax "w"))))
524 (?! . ?/) (?: . ?@) (?\[ . ?`) (?\{ . ?~))
525 (space (check (lex--match-syntax . ,(lex--compile-syntax " "))))
526 (xdigit digit (?a . ?f) (?A . ?F))
527 (ascii (?\0 . ?\177))
528 (nonascii (?\200 . #x3fffff))
529 (unibyte ascii (#x3fff00 . #x3fffff))
530 (multibyte (#x100 . #x3ffeff))
531 (word (check (lex--match-syntax . ,(lex--compile-syntax "w"))))
532 ;; `rx' alternative names.
548 "Definition of char classes.
549 Each element has the form (CLASS . DEFINITION) where definition
550 is a list of elements that can be either CHAR or (CHAR . CHAR),
551 or CLASS (another char class) or (check (PREDICATE . ARG))
552 or (check (not (PREDICATE . ARG))).")
;; Elsewhere in this file the variable is let-bound either to nil or to
;; (case-table-get-table (current-case-table) 'eqv).  It is read with
;; `aref' by `lex--char-equiv' and `lex--nfa'; when it is nil,
;; `lex--char-equiv' returns nil without looking anything up.
554 (defvar lex--char-equiv-table nil
555 "Equiv-case table to use to compile case-insensitive regexps.")
557 (defun lex--char-equiv (char)
558 (when lex--char-equiv-table
561 (while (and (setq tmp (aref lex--char-equiv-table tmp))
564 (if chars (cons char chars)))))
566 ;; For convenience we use lex itself to tokenize charset strings, so we
567 ;; define it in another file.
568 (autoload 'lex--parse-charset "lex-parse-re")
570 (defun lex--nfa (re state)
571 (cl-assert state) ;If `state' is nil we can't match anyway.
574 (let ((chars (lex--char-equiv re)))
577 (let ((ct (make-char-table 'lexer)))
578 (dolist (char chars) (aset ct char state))
581 (if (null lex--char-equiv-table)
582 ;; (Very) minor optimization.
583 (nconc (mapcar 'identity re) state)
584 (lex--nfa `(seq ,@(mapcar 'identity re)) state)))
586 (pcase (or (car-safe re) re)
587 ((or `: `seq `sequence
590 (dolist (elem (reverse (cdr re)))
591 (setq state (lex--nfa elem state)))
593 ((or `char `in `not-char)
594 (let ((chars (cdr re))
597 (char nil) ;The char seen, or nil if none, or t if more than one.
598 (ct (make-char-table 'lexer)))
599 (when (or (eq 'not (car chars)) (eq 'not-char (car re)))
600 (setq chars (cdr chars))
601 (set-char-table-range ct t state)
605 (let ((range (pop chars)))
608 (setq chars (append (cdr (lex--parse-charset range)) chars)))
610 (setq range (or (cdr (assq range lex--char-classes))
611 (error "Uknown char class `%s'" range)))
612 (setq chars (append range chars)))
613 ((and (consp range) (eq 'check (car range)))
614 (push (cadr range) checks))
616 (setq char (if (or char (not (characterp range))
617 (and lex--char-equiv-table
618 (lex--char-equiv range)))
620 ;; Set the range, first, regardless of case-folding. This is
621 ;; important because case-tables like to be set with few
622 ;; large ranges rather than many small ones, as is done in
623 ;; the case-fold loop.
624 (set-char-table-range ct range state)
625 (when (and lex--char-equiv-table
626 ;; Avoid looping over all characters.
627 (not (equal range '(#x100 . #x3ffeff))))
628 ;; Add all the case-equiv chars.
629 (let ((i (if (consp range) (car range) range))
630 (max (if (consp range) (cdr range) range))
634 (while (and (setq char (aref lex--char-equiv-table char))
636 (aset ct char state))
637 (setq i (1+ i)))))))))
639 (let ((res (if (or (eq char t) fail)
641 (if char (cons char state)))))
642 (if (and (not fail) checks)
643 (setq state (lex--nfa 'anything state)))
644 (dolist (check checks)
647 ;; We do an `and' of the negation of the check and res.
648 (if (eq (car-safe check) 'not)
649 (list 'check (cadr check) res)
650 (cl-list* 'check check nil res))
651 ;; An `or' of the check and res.
652 (if (eq (car-safe check) 'not)
653 (list 'check (cadr check) res state)
654 (cl-list* 'check check state res)))))
657 ((or `union `or `| `orelse)
659 (cons (if (eq (car re) 'orelse) 'orelse 'or)
660 (mapcar (lambda (re) (lex--nfa re state)) (cdr re)))))
661 (push newstate lex--states)
664 ((or `inter `intersection `&)
665 (if (<= (length re) 2)
666 ;; Avoid constructing degenerate `and' nodes.
667 (lex--nfa (cadr re) state)
668 ;; Just using `and' is not enough because we have to enforce that the
669 ;; sub-regexps (rather than the whole regexp) match the same string.
670 ;; So we need to mark the juncture point.
671 (let* ((join `(join nil ,@state))
673 `(and ,@(mapcar (lambda (re) (lex--nfa re join)) (cdr re)))))
674 (push newstate lex--states)
677 ((or `0+ `zero-or-more `* `*\?)
678 (let ((newstate (list 'state)))
679 (let ((lexer (lex--nfa (cons 'seq (cdr re)) newstate)))
680 (setcdr newstate (if (memq (car re) '(*\?))
682 (list lexer state))))
683 (setcar newstate (if (memq (car re) '(*\?)) 'orelse 'or))
684 (push newstate lex--states)
687 ((or `string-end `eos `eot `buffer-end `eob)
688 `(check (lex--match-eobp) ,state))
689 ((or `string-start `bos `bot `buffer-start `bob)
690 `(check (lex--match-bobp) ,state))
691 ((or `line-end `eol) `(check (lex--match-eolp) ,state))
692 ((or `line-start `bol) `(check (lex--match-bolp) ,state))
693 ((or `word-start `bow) `(check (lex--match-bowp) ,state))
694 ((or `word-end `eow) `(check (lex--match-eowp) ,state))
695 (`symbol-start `(check (lex--match-bosp) ,state))
696 (`symbol-end `(check (lex--match-eosp) ,state))
697 (`not-word-boundary `(check (lex--match-not-word-boundary) ,state))
698 (`word-boundary `(check (lex--match-not-word-boundary) nil . ,state))
699 (`syntax `(check (lex--match-syntax
700 . ,(apply 'lex--compile-syntax (cdr re)))
701 ,(lex--nfa 'anything state)))
702 (`not-syntax `(check (lex--match-syntax
703 . ,(apply 'lex--compile-syntax (cdr re)))
704 nil . ,(lex--nfa 'anything state)))
705 (`category `(check (lex--match-category
706 . ,(lex--compile-category (cadr re)))
707 ,(lex--nfa 'anything state)))
708 (`not-category `(check (lex--match-category
709 . ,(lex--compile-category (cadr re)))
710 nil . ,(lex--nfa 'anything state)))
712 ;; `rx' accepts char-classes directly as regexps. Let's reluctantly
714 ((or `digit `numeric `num `control `cntrl `hex-digit `hex `xdigit `blank
715 `graphic `graph `printing `print `alphanumeric `alnum `letter
716 `alphabetic `alpha `ascii `nonascii `lower `lower-case `upper
717 `upper-case `punctuation `punct `space `whitespace `white)
718 (lex--nfa `(char ,re) state))
721 (let ((lex--char-equiv-table nil))
722 (lex--nfa `(seq ,@(cdr re)) state)))
725 (let ((lex--char-equiv-table
726 (case-table-get-table (current-case-table) 'eqv)))
727 (lex--nfa `(seq ,@(cdr re)) state)))
731 `submatch `group `backref
732 ;; Greediness control
733 `minimal-match `maximal-match)
734 (error "`%s' Not implemented" (or (car-safe re) re)))
736 ((or `not-newline `nonl `dot) (lex--nfa '(char not ?\n) state))
737 (`anything (lex--nfa '(char not) state))
738 ((or `word `wordchar) (lex--nfa '(syntax w) state))
739 (`not-wordchar (lex--nfa '(not-syntax w) state))
742 ;; `rx' uses it for (char ...) sets, and sregex uses it for `dot'.
743 (lex--nfa (if (consp re) (cons 'char (cdr re)) '(char not ?\n)) state))
746 ;; We could define negation directly on regexps, but it's easier to
747 ;; do it on NFAs since those have fewer cases to deal with.
749 ;; Throw away the mergeable states generated while computing the
750 ;; posnfa, since it's only an intermediate datastructure.
752 (lex--nfa `(seq ,@(cdr re)) '(stop negate)))))
753 (lex-negate posnfa state)))
756 ;; The `not' as used in `rx' should be deprecated so we can make it
757 ;; an alias for `negate', whose semantics is different. E.g.
758 ;; (negate (char ...)) matches the empty string and 2-char strings.
760 (pcase (or (car-safe re) re)
762 (message "`not' deprecated: use not-word-boundary")
763 (lex--nfa 'not-word-boundary state))
765 (message "`not' deprecated: use (%s not ...)" (or (car-safe re) re))
766 (lex--nfa (cl-list* (car re) 'not (cdr re)) state))
767 ((or `category `syntax)
768 (message "`not' deprecated: use not-%s" (car re))
769 (lex--nfa (cons (intern (format "not-%s" (car re))) (cdr re)) state))
770 (elem (error "lex.el: unexpected argument `%S' to `not'." elem))))
773 ;; `rx' defined `and' as `sequence', but we may want to define it
774 ;; as intersection instead.
775 (error "`and' is deprecated, use `seq', `:', or `sequence' instead"))
777 ((or `1+ `one-or-more `+ `+\?)
778 (lex--nfa `(seq (seq ,@(cdr re))
779 (,(if (memq (car re) '(+\?)) '*\? '0+) ,@(cdr re)))
781 ((or `opt `zero-or-one `optional `\?)
782 (lex--nfa `(or (seq ,@(cdr re)) "") state))
784 (lex--nfa `(orelse "" (seq ,@(cdr re))) state))
786 (let ((min (nth 1 re))
790 (setq res (list max)) (setq max min))
791 (lex--nfa `(seq ,@(append (make-list (or min 0)
792 (if (eq (length res) 1)
797 (make-list (- max (or min 0))
800 (`>= (lex--nfa `(repeat ,(nth 1 re) nil ,@(nthcdr 2 re)) state))
803 (lex--nfa (lex-parse-re (nth 1 re) (car re)) state))
804 (elem (error "lex.el: unknown RE element %S" elem))))))
806 (defun lex--negate-inftail (state howmany)
807 ;; We hashcons the infinite tails and store them in the memoize table.
808 ;; This is an abuse, but saves us from passing it around as an
810 (let ((inftail-1+ (gethash state lex--memoize)))
812 ;; Precompute the final infinitely repeating tail.
813 (setq inftail-1+ `(table . ,(make-char-table 'lexer)))
814 (set-char-table-range (cdr inftail-1+) t `(or ,state ,inftail-1+))
815 (push (aref (cdr inftail-1+) 0) lex--states)
816 (puthash state inftail-1+ lex--memoize))
819 (`0+ (aref (cdr inftail-1+) 0))
820 (_ (error "lex.el: howmany is `%S' instead of one of 1+/0+" howmany)))))
822 (defun lex--negate-now (nfa state)
824 (`nil (lex--negate-inftail state '0+))
826 `(check ,(nth 1 nfa) ,(lex--negate-memo (nth 2 nfa) state)
827 ,@(lex--negate-memo (nthcdr 3 nfa) state)))
830 ;; This is valid but should normally not happen.
831 (lex--negate-now `(or (stop ,(cadr nfa)) ,(cddr nfa)) state)
832 (lex--negate-inftail state '1+)))
835 (let ((join `(join nil . ,state)))
836 `(and ,@(mapcar (lambda (nfa) (lex--negate-memo nfa join)) (cdr nfa)))))
839 `(or ,@(mapcar (lambda (nfa) (lex--negate-memo nfa state)) (cdr nfa))))
842 ;; The join says: either exit the `and' because we matched all branches,
843 ;; or keep matching further. Negation makes the synchrony between
844 ;; `and' branches irrelevant, so we can consider it as an `or(else)'.
846 ;; This is valid but should normally not happen.
847 (lex--negate-now `(or ,(cadr nfa) ,(cddr nfa)) state)
848 (lex-negate (cddr nfa) state)))
850 (let ((ct (make-char-table 'lexer)))
851 ;; Get inftail-0+ from the hashtable.
852 (set-char-table-range ct t (lex--negate-inftail state '0+))
853 (if (characterp (car nfa))
854 (aset ct (car nfa) (lex--negate-memo (cdr nfa) state))
855 (cl-assert (eq 'table (car nfa)))
856 (map-char-table (lambda (range nfa)
857 (set-char-table-range ct range
858 (lex--negate-memo nfa state)))
860 `(or ,state (table ,@ct))))))
862 (defun lex--negate-memo (nfa state)
863 ;; Make sure our `inftail' abuse of the hash table doesn't break anything.
864 (cl-assert (not (eq nfa state)))
865 (or (gethash nfa lex--memoize)
866 (let ((newstate (cons 'state nil)))
867 (puthash nfa newstate lex--memoize)
868 (let ((res (lex--negate-now nfa state)))
869 (when (memq (car res) '(or and orelse))
870 (push newstate lex--states))
873 (setcar newstate (car res))
874 (setcdr newstate (cdr res))
877 (defun lex-negate (nfa state)
878 "Concatenate the negation of NFA with STATE.
880 (let ((lex--memoize (make-hash-table :test 'eq)))
881 (lex--negate-memo nfa state)))
883 (defun lex--dfa-wrapper (f)
884 (let* ((lex--states ())
887 (lex--memoize (make-hash-table :test 'lex--set-eq))
888 (states-dfa (make-hash-table :test 'eq)))
891 (dolist (state (prog1 lex--states (setq lex--states nil)))
892 (let ((merged (apply 'lex--merge-now state)))
893 (if (memq (car merged) '(and or orelse))
894 ;; The merge could not be performed for some reason:
895 ;; let's re-schedule it.
896 (push state postponed)
897 (puthash state merged states-dfa))))
900 ;; If states-dfa is empty it means we haven't made any progress,
901 ;; so we're stuck in an infinite loop. Hopefully this cannot happen?
902 (cl-assert (not (zerop (hash-table-count states-dfa))))
903 (maphash (lambda (k v)
905 ;; With `intersection', lex--merge may end up returning
906 ;; nil if the intersection is empty, so `v' can be
907 ;; nil here. Since `k' is necessarily a cons cell,
908 ;; we can't turn it into nil, so we turn it into
909 ;; a more costly lexer that also fails for all inputs.
915 (setq lex--states postponed)
916 (setq postponed nil)))
920 (defun lex-compile (alist)
923 (let* ((lex--char-equiv-table
925 (case-table-get-table (current-case-table) 'eqv)))
928 ,@(mapcar (lambda (x) (lex--nfa (car x) (list 'stop (cdr x))))
930 (push newstate lex--states)
933 (defun lex-search-dfa (match-dfa)
934 ;; This constructs a search-DFA whose last match should be the leftmost
938 (lex--nfa '(*\? (char not)) match-dfa))))
941 (defun lex--terminate-if (new old)
945 (t (while new (let ((x (pop new))) (if (not (memq x old)) (push x old))))
948 (defun lex--optimize-1 (lexer)
949 (let ((terminate nil))
953 (let ((ct (cdr lexer))
955 ;; Optimize each entry.
958 (let ((cell (lex--optimize v)))
959 (setq terminate (lex--terminate-if (cdr cell) terminate))
960 (set-char-table-range ct range (car cell))))
962 ;; Optimize the internal representation of the table.
963 (optimize-char-table (cdr lexer) 'eq)
964 ;; Eliminate the table if possible.
968 (if (and (characterp range) (null char))
974 (_ (setcar lexer 'char) (setcdr lexer (aref ct char)) lexer))))
976 (let ((cell (lex--optimize (cddr lexer))))
978 (setf (cddr lexer) (car cell)))
981 (let* ((test (nth 1 lexer))
982 (cellf (lex--optimize (nthcdr 3 lexer)))
983 (fail (setf (nthcdr 3 lexer) (car cellf)))
984 (cells (lex--optimize (nth 2 lexer)))
985 (succ (setf (nth 2 lexer) (car cells))))
986 (setq terminate (lex--terminate-if (cdr cellf) terminate))
987 (setq terminate (lex--terminate-if (cdr cells) terminate))
988 ;; TODO: the check-optimizations below only work on consecutive
           ;; pairs of checks.  We need to be more aggressive and make sure
990 ;; the optimized DFA never does twice the same test at the same
991 ;; position. Most importantly: don't do the same test in
992 ;; a tight loop as in "(^\<)*".
993 (when (eq 'check (car succ))
995 ((equal test (nth 1 succ)) ;Same successful test.
996 (setf (nth 2 lexer) (setq succ (nth 2 succ))))
997 ;; TODO: we can add rules such as bobp -> eolp,
998 ;; bosp -> bowp, (syntax X) -> (syntax Y X), ...
1000 (when (eq 'check (car fail))
1002 ((equal test (nth 1 fail)) ;Same failing test.
1003 (setf (nthcdr 3 lexer) (setq fail (nthcdr 3 succ))))
1004 ;; TODO: we can add rules such as !eolp -> !bobp,
1005 ;; !bowp -> !bosp, !(syntax Y X) -> !(syntax X), ...
1007 (if (or succ fail) lexer)))
1009 (cl-assert (characterp (car lexer)))
1010 (let ((cell (lex--optimize (cdr lexer))))
1011 (setq terminate (lex--terminate-if (cdr cell) terminate))
1012 (if (setf (cdr lexer) (car cell))
1014 (if (consp terminate)
1015 (delq lexer terminate)
1018 (defun lex--optimize (lexer)
1020 ;; The lex--memoize cache maps lexer states to (LEXER . TERMINATE) where
1021 ;; TERMINATE is either t to say that LEXER can terminate or a list of
1022 ;; lexers which means that LEXER terminates only if one of the lexers in
1023 ;; the list terminates.
1024 (let ((cache (gethash lexer lex--memoize)))
1026 ;; Optimize (char C) to nil.
1027 (if (and (characterp (caar cache)) (null (cdar cache))) nil cache)
1028 ;; Store a value indicating that we're in the process of computing it,
1029 ;; so when we encounter a loop, we don't recurse indefinitely.
1030 ;; Not knowing any better, we start by stating the tautology that
1031 ;; `lexer' terminates if and only if `lexer' terminates.
1032 (let ((cell (cons lexer (list lexer))))
1033 (puthash lexer cell lex--memoize)
1034 (let ((res (lex--optimize-1 lexer)))
1035 (if (and (car res) (cdr res))
1039 (puthash lexer '(nil) lex--memoize)
(defun lex-optimize (lexer)
  "Optimize LEXER and return the optimized lexer.
The work is delegated to `lex--optimize', run with a fresh `eq'
hash table bound to `lex--memoize'.  Once the optimization is done,
the number of entries in that table is reported with `message'."
  (let* ((lex--memoize (make-hash-table :test 'eq))
         ;; `lex--optimize' returns a cons whose car is the lexer.
         (optimized (car (lex--optimize lexer))))
    (message "Visited %d states" (hash-table-count lex--memoize))
    optimized))
1047 (defmacro lex-case (object posvar &rest cases)
1048 (declare (indent 2))
1050 (alist (mapcar (lambda (case) (cons (car case) (cl-incf i))) cases))
1051 (lex (lex-compile alist))
1052 (tmpsym (make-symbol "tmp")))
1054 `(let ((,tmpsym (lex-match-string ',lex ,object ,posvar)))
1055 (pcase (car ,tmpsym)
1056 ,@(mapcar (lambda (case)
1059 (list ,posvar (setq ,posvar (cadr ,tmpsym))))
(defun lex--match-bobp (arg pos &optional string)
  "Return non-nil if POS is the start of the text being matched.
With a non-nil STRING the start is index 0; otherwise it is
`point-min' of the current buffer.  ARG is ignored."
  (if string
      (= pos 0)
    (= pos (point-min))))
(defun lex--match-eobp (arg pos &optional string)
  "Return non-nil if POS is the end of the text being matched.
With a non-nil STRING the end is `length' of STRING; otherwise it
is `point-max' of the current buffer.  ARG is ignored."
  (let ((end (if string
                 (length string)
               (point-max))))
    (= pos end)))
(defun lex--match-bolp (arg pos &optional string)
  "Return non-nil if POS is at the beginning of a line.
With a non-nil STRING, that means POS is 0 or the character just
before POS in STRING is a newline.  Otherwise the current buffer
is examined: the character before POS must be a newline, or there
must be no such character.  ARG is ignored."
  (cond
   ;; Buffer case: `char-before' gives nil when nothing precedes POS.
   ((null string) (memq (char-before pos) '(nil ?\n)))
   ((= pos 0) t)
   (t (eq ?\n (aref string (1- pos))))))
(defun lex--match-eolp (arg pos &optional string)
  "Return non-nil if POS is at the end of a line.
With a non-nil STRING, that means POS equals the length of STRING
or the character at POS in STRING is a newline.  Otherwise the
current buffer is examined: the character after POS must be a
newline, or there must be no such character.  ARG is ignored."
  (cond
   ;; Buffer case: `char-after' gives nil when nothing follows POS.
   ((null string) (memq (char-after pos) '(nil ?\n)))
   ((= pos (length string)) t)
   (t (eq ?\n (aref string pos)))))
(defun lex--match-bowp (arg pos &optional string)
  "Return non-nil if POS is at the beginning of a word.
That is, the character before POS (if any) is not a word
constituent and the character at POS is one.  With a non-nil
STRING the characters are taken from STRING and classified with
`char-syntax'; otherwise the current buffer is examined with
`syntax-after'.  ARG is ignored."
  ;; The car of a raw syntax descriptor holds the syntax flags in its
  ;; high bits, so it must go through `syntax-class' before being
  ;; compared with the word class (2).  Otherwise a word constituent
  ;; that carries flags (e.g. \"w p\") is not recognised.
  (and (not (if string
                (and (> pos 0)
                     (eq ?w (char-syntax (aref string (1- pos)))))
              (and (> pos (point-min))
                   (eq 2 (syntax-class (syntax-after (1- pos)))))))
       (if string
           (and (< pos (length string))
                (eq ?w (char-syntax (aref string pos))))
         (eq 2 (syntax-class (syntax-after pos))))))
(defun lex--match-eowp (arg pos &optional string)
  "Return non-nil if POS is at the end of a word.
That is, the character before POS is a word constituent and the
character at POS (if any) is not.  With a non-nil STRING the
characters are taken from STRING and classified with
`char-syntax'; otherwise the current buffer is examined with
`syntax-after'.  ARG is ignored."
  ;; `syntax-class' strips the syntax flags stored in the high bits of
  ;; the raw descriptor's car, so that flagged word constituents
  ;; (e.g. \"w p\") still compare equal to the word class (2).
  (and (if string
           (and (> pos 0)
                (eq ?w (char-syntax (aref string (1- pos)))))
         (and (> pos (point-min))
              (eq 2 (syntax-class (syntax-after (1- pos))))))
       (not (if string
                (and (< pos (length string))
                     (eq ?w (char-syntax (aref string pos))))
              (eq 2 (syntax-class (syntax-after pos)))))))
1095 (defun lex--match-bosp (arg pos &optional string)
1096 (and (not (if string
1098 (memq (char-syntax (aref string (1- pos))) '(?w ?_)))
1099 (and (> pos (point-min))
1100 (memq (car (syntax-after (1- pos))) '(2 3)))))
1101 (if string (and (< pos (length string))
1102 (memq (char-syntax (aref string pos)) '(?w ?_)))
1103 (memq (car (syntax-after pos)) '(2 3)))))
(defun lex--match-eosp (arg pos &optional string)
  "Return non-nil if POS is at the end of a symbol.
That is, the character before POS is a word or symbol constituent
and the character at POS (if any) is not.  With a non-nil STRING
the characters are taken from STRING and classified with
`char-syntax'; otherwise the current buffer is examined with
`syntax-after'.  ARG is ignored."
  ;; `syntax-class' strips the syntax flags stored in the high bits of
  ;; the raw descriptor's car, so that flagged constituents
  ;; (e.g. \"_ p\") are still found among the word (2) and symbol (3)
  ;; classes.
  (and (if string
           (and (> pos 0)
                (memq (char-syntax (aref string (1- pos))) '(?w ?_)))
         (and (> pos (point-min))
              (memq (syntax-class (syntax-after (1- pos))) '(2 3))))
       (not (if string
                (and (< pos (length string))
                     (memq (char-syntax (aref string pos)) '(?w ?_)))
              (memq (syntax-class (syntax-after pos)) '(2 3))))))
(defun lex--match-not-word-boundary (arg pos &optional string)
  "Return non-nil if POS is not at a word boundary.
POS is not at a boundary when the character before it and the
character at it are either both word constituents or both not.
With a non-nil STRING the characters are taken from STRING and
classified with `char-syntax'; otherwise the current buffer is
examined with `syntax-after'.  ARG is ignored."
  ;; Both operands are normalised to t or nil, so `eq' tests whether
  ;; the two sides agree.  `syntax-class' strips the syntax flags
  ;; stored in the high bits of the raw descriptor's car, so flagged
  ;; word constituents (e.g. \"w p\") are still seen as class 2.
  (eq (if string
          (and (> pos 0)
               (eq ?w (char-syntax (aref string (1- pos)))))
        (and (> pos (point-min))
             (eq 2 (syntax-class (syntax-after (1- pos))))))
      (if string
          (and (< pos (length string))
               (eq ?w (char-syntax (aref string pos))))
        (eq 2 (syntax-class (syntax-after pos))))))
(defun lex--match-upper (arg pos &optional string)
  "Return non-nil if the character at POS changes under `downcase'.
The character is read from STRING when it is non-nil, otherwise
from the current buffer.  Return nil when POS is not below the end
of STRING (or `point-max').  ARG is ignored."
  (let ((limit (if string (length string) (point-max))))
    (and (< pos limit)
         (let ((ch (if string
                       (aref string pos)
                     (char-after pos))))
           (not (eq ch (downcase ch)))))))
(defun lex--match-lower (arg pos &optional string)
  "Return non-nil if the character at POS changes under `upcase'.
The character is read from STRING when it is non-nil, otherwise
from the current buffer.  Return nil when POS is not below the end
of STRING (or `point-max').  ARG is ignored."
  (let ((limit (if string (length string) (point-max))))
    (and (< pos limit)
         (let ((ch (if string
                       (aref string pos)
                     (char-after pos))))
           (not (eq ch (upcase ch)))))))
1132 (defun lex--match-category (category pos &optional string)
1133 (when (< pos (if string (length string) (point-max)))
1134 (aref (char-category-set (if string (aref string pos)
1138 (defun lex--match-syntax (syntaxes pos &optional string)
1139 (when (< pos (if string (length string) (point-max)))
1140 (memq (car (if string (aref (syntax-table) (aref string pos))
1141 (syntax-after pos)))
1145 (defun lex-match-string (lex string &optional start stop)
1146 "Match LEX against STRING between START and STOP.
1147 Return a triplet (VALUE ENDPOS . LEXER) where VALUE is the
1148 value of returned by the lexer for the match found (or nil), ENDPOS
1149 is the end position of the match found (or nil), and LEXER is the
1150 state of the engine at STOP, which can be passed back to
1151 `lex-match-string' to continue the match elsewhere."
1152 ;; FIXME: Move this to C.
1153 (unless start (setq start 0))
1154 (unless stop (setq stop (length string)))
1155 (let ((match (list nil nil))
1159 (while (eq (car lex) 'check)
1160 (setq lex (if (funcall (car (nth 1 lex)) (cdr (nth 1 lex))
1162 (nth 2 lex) (nthcdr 3 lex))))
1163 (when (eq (car lex) 'stop)
1164 ;; Don't stop yet, we're looking for the longest match.
1165 (setq match (list (cadr lex) start))
1166 (message "Found match: %s" match)
1167 (setq lex (cddr lex)))
1168 (cl-assert (not (eq (car lex) 'stop)))
1169 (and lex (< start stop)))
1170 (let ((c (aref string start)))
1171 (setq start (1+ start))
1173 ((eq (car lex) 'table) (aref (cdr lex) c))
1174 ((integerp (car lex)) (if (eq c (car lex)) (cdr lex)))))
1175 (setq lastlex lex)))
1176 (message "Final search pos considered: %s" start)
1177 ;; The difference between `lex' and `lastlex' is basically that `lex'
1178 ;; may depend on data after `stop' (if there was an `end-of-file' or
1179 ;; `word-boundary' or basically any `check'). So let's return `lastlex'
1180 ;; so it can be correctly used to continue the match with a different
1181 ;; content than what's after `stop'.
1182 (nconc match lastlex)))
1184 (defun lex-match-string-first (lex string &optional start stop)
1185 "Match LEX against STRING between START and STOP.
1186 Return a triplet (VALUE ENDPOS . LEXER) where VALUE is the
1187 value of returned by the lexer for the match found (or nil), ENDPOS
1188 is the end position of the match found (or nil), and LEXER is the
1189 state of the engine at STOP, which can be passed back to
1190 `lex-match-string' to continue the match elsewhere."
1191 ;; FIXME: Move this to C.
1192 (unless start (setq start 0))
1193 (unless stop (setq stop (length string)))
1194 (let ((match (list nil nil))
1199 (while (eq (car lex) 'check)
1200 (setq lex (if (funcall (car (nth 1 lex)) (cdr (nth 1 lex))
1202 (nth 2 lex) (nthcdr 3 lex))))
1203 (when (eq (car lex) 'stop)
1204 (throw 'found (cl-list* (cadr lex) start (cddr lex))))
1205 (cl-assert (not (eq (car lex) 'stop)))
1206 (and (not match) lex (< start stop)))
1207 (let ((c (aref string start)))
1208 (setq start (1+ start))
1210 ((eq (car lex) 'table) (aref (cdr lex) c))
1211 ((integerp (car lex)) (if (eq c (car lex)) (cdr lex)))))
1212 (setq lastlex lex)))
1213 ;; The difference between `lex' and `lastlex' is basically that `lex'
1214 ;; may depend on data after `stop' (if there was an `end-of-file' or
1215 ;; `word-boundary' or basically any `check'). So let's return `lastlex'
1216 ;; so it can be correctly used to continue the match with a different
1217 ;; content than what's after `stop'.
1218 (cl-list* nil start lastlex))))
1220 (defun lex-match-buffer (lex &optional stop)
1221 "Match LEX against buffer between point and STOP.
1222 Return a triplet (VALUE ENDPOS . LEXER) where VALUE is the
1223 value of returned by the lexer for the match found (or nil), ENDPOS
1224 is the end position of the match found (or nil), and LEXER is the
1225 state of the engine at STOP, which can be passed back to
1226 continue the match elsewhere."
1227 ;; FIXME: Move this to C.
1228 (unless stop (setq stop (point-max)))
1229 (let ((start (point))
1230 (match (list nil nil))
1234 (while (eq (car lex) 'check)
1235 (setq lex (if (funcall (car (nth 1 lex)) (cdr (nth 1 lex))
1237 (nth 2 lex) (nthcdr 3 lex))))
1238 (when (eq (car lex) 'stop)
1239 ;; Don't stop yet, we're looking for the longest match.
1240 (setq match (list (cadr lex) start))
1241 (message "Found match: %s" match)
1242 (setq lex (cddr lex)))
1243 (cl-assert (not (eq (car lex) 'stop)))
1244 (and lex (< start stop)))
1245 (let ((c (char-after start)))
1246 (setq start (1+ start))
1248 ((eq (car lex) 'table) (aref (cdr lex) c))
1249 ((integerp (car lex)) (if (eq c (car lex)) (cdr lex)))))
1250 (setq lastlex lex)))
1251 (message "Final search pos considered: %s" start)
1252 ;; The difference between `lex' and `lastlex' is basically that `lex'
1253 ;; may depend on data after `stop' (if there was an `end-of-file' or
1254 ;; `word-boundary' or basically any `check'). So let's return `lastlex'
1255 ;; so it can be correctly used to continue the match with a different
1256 ;; content than what's after `stop'.
1257 (nconc match lastlex)))
1260 ;;; lex.el ends here