1 ;;; tNFA.el --- Tagged non-deterministic finite-state automata
3 ;; Copyright (C) 2008-2010, 2012 Free Software Foundation, Inc
5 ;; Author: Toby Cubitt <toby-predictive@dr-qubit.org>
7 ;; Keywords: extensions, matching, data structures
8 ;; tNFA, NFA, DFA, finite state automata, automata, regexp
9 ;; Package-Requires: ((queue "0.1"))
10 ;; URL: http://www.dr-qubit.org/emacs.php
11 ;; Repository: http://www.dr-qubit.org/git/predictive.git
13 ;; This file is part of Emacs.
15 ;; GNU Emacs is free software: you can redistribute it and/or modify it under
16 ;; the terms of the GNU General Public License as published by the Free
17 ;; Software Foundation, either version 3 of the License, or (at your option)
20 ;; GNU Emacs is distributed in the hope that it will be useful, but WITHOUT
21 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22 ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
25 ;; You should have received a copy of the GNU General Public License along
26 ;; with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
31 ;; A tagged, non-deterministic finite state automata (NFA) is an abstract
32 ;; computing machine that recognises regular languages. In layman's terms,
33 ;; they are used to decide whether a string matches a regular expression. The
34 ;; "tagged" part allows the NFA to do group-capture: it returns information
35 ;; about which parts of a string matched which subgroup of the regular
38 ;; Why re-implement regular expression matching when Emacs comes with
39 ;; extensive built-in support for regexps? Primarily, because some algorithms
40 ;; require access to the NFA states produced part way through the regular
41 ;; expression matching process (see the trie.el package for an
42 ;; example). Secondarily, because Emacs regexps only work on strings, whereas
43 ;; regular expressions can usefully be used in Elisp code to match other
44 ;; sequence types, not just strings.
46 ;; A tagged NFA can be created from a regular expression using
47 ;; `tNFA-from-regexp', and its state can be updated using
48 ;; `tNFA-next-state'. You can discover whether a state is a matching state
49 ;; using `tNFA-match-p', extract subgroup capture data from it using
50 ;; `tNFA-group-data', check whether a state has any wildcard transitions using
51 ;; `tNFA-wildcard-p', and get a list of non-wildcard transitions using
52 ;; `tNFA-transitions'. Finally, `tNFA-regexp-match' uses tagged NFAs to decide
53 ;; whether a regexp matches a given string.
55 ;; Note that Emacs' regexps are not regular expressions in the original
56 ;; meaning of that phrase. Emacs regexps implement additional features (in
57 ;; particular, back-references) that allow them to match far more than just
58 ;; regular languages. This comes at a cost: regexp matching can potentially be
59 ;; very slow (NP-hard in fact, though the hard cases rarely crop up in
60 ;; practise), whereas there are efficient (polynomial-time) algorithms for
61 ;; matching regular expressions (in the original sense). Therefore, this
62 ;; package only supports a subset of the full Emacs regular expression
63 ;; syntax. See the function docstrings for more information.
65 ;; This package essentially implements Laurikari's algorithm, as described in
66 ;; his master's thesis, but it builds the corresponding tagged deterministic
67 ;; finite state automaton (DFA) on-the-fly as needed.
69 ;; This package uses the queue package queue.el.
75 ;; * work-around mysterious byte-compiler bug by defining
76 ;; `tNFA--NFA-state-create' and `tNFA--NFA-state-create-tag' via `defun'
77 ;; instead of directly in `defstruct'
86 (eval-when-compile (require 'cl))
91 ;;; ================================================================
92 ;;; Replcements for CL functions
94 (defun* tNFA--assoc (item alist &key (test 'eq))
95 ;; Return first cons cell in ALIST whose CAR matches ITEM according to
96 ;; :test function (defaulting to `eq')
98 (or (not (consp (car alist)))
99 (not (funcall test item (caar alist)))))
100 (setq alist (cdr alist)))
105 ;;; ================================================================
108 ;;; ----------------------------------------------------------------
109 ;;; tagged NFA states
114 (:constructor tNFA--state-create-initial
115 (NFA-state num-tags min-tags max-tags
117 (tags (tNFA--tags-create num-tags min-tags max-tags))))
118 (:constructor tNFA--state-create (NFA-state tags))
122 (defmacro tNFA--state-id (state)
123 `(tNFA--NFA-state-id (tNFA--state-NFA-state ,state)))
125 (defmacro tNFA--state-type (state)
126 `(tNFA--NFA-state-type (tNFA--state-NFA-state ,state)))
128 (defmacro tNFA--state-label (state)
129 `(tNFA--NFA-state-label (tNFA--state-NFA-state ,state)))
131 (defmacro tNFA--state-in-degree (state)
132 `(tNFA--NFA-state-in-degree (tNFA--state-NFA-state ,state)))
134 (defmacro tNFA--state-next (state)
135 `(tNFA--NFA-state-next (tNFA--state-NFA-state ,state)))
137 (defmacro tNFA--state-count (state)
138 `(tNFA--NFA-state-count (tNFA--state-NFA-state ,state)))
142 ;;; ----------------------------------------------------------------
145 (declare (special NFA--state-id))
151 (:constructor tNFA---NFA-state-create
152 (&optional type label next
156 (id (incf NFA--state-id))
159 ;; (setf (tNFA--NFA-state-count next)
160 ;; (incf (tNFA--NFA-state-in-degree next)))))
162 (:constructor tNFA--NFA-state-create-branch
168 (id (incf NFA--state-id))))
169 (:constructor tNFA---NFA-state-create-tag
176 (id (incf NFA--state-id))
179 ;; (setf (tNFA--NFA-state-count next)
180 ;; (incf (tNFA--NFA-state-in-degree next)))))
183 id type label in-degree
184 count tNFA-state ; used internally in NFA evolution algorithms
188 ;; Define these via defun instead of using the dummy argument in the
189 ;; above defstruct to work around a mysterious byte-compiler bug.
191 (defun tNFA--NFA-state-create (&optional type label next)
193 (setf (tNFA--NFA-state-count next)
194 (incf (tNFA--NFA-state-in-degree next))))
195 (tNFA---NFA-state-create type label next))
197 (defun tNFA--NFA-state-create-tag (tag &optional next)
199 (setf (tNFA--NFA-state-count next)
200 (incf (tNFA--NFA-state-in-degree next))))
201 (tNFA---NFA-state-create-tag tag next))
204 ;; tag number for a tagged epsilon transition is stored in label slot
205 (defalias 'tNFA--NFA-state-tag 'tNFA--NFA-state-label)
207 (defmacro tNFA--NFA-state-tags (state)
208 `(tNFA--state-tags (tNFA--NFA-state-tNFA-state ,state)))
211 (defun tNFA--NFA-state-patch (attach state)
212 ;; patch STATE onto ATTACH. Return value is meaningless
214 (tNFA--NFA-state-type attach)
215 (tNFA--NFA-state-type state)
216 (tNFA--NFA-state-label attach)
217 (tNFA--NFA-state-label state)
218 (tNFA--NFA-state-next attach)
219 (tNFA--NFA-state-next state)
220 (tNFA--NFA-state-count state)
221 (incf (tNFA--NFA-state-in-degree state))))
224 (defun tNFA--NFA-state-make-epsilon (state next)
225 ;; create an epsilon transition from STATE to NEXT
227 (tNFA--NFA-state-type state) 'epsilon
228 (tNFA--NFA-state-label state) nil
229 (tNFA--NFA-state-next state) next
230 (tNFA--NFA-state-count next)
231 (incf (tNFA--NFA-state-in-degree next))))
234 (defun tNFA--NFA-state-make-branch (state next)
235 ;; create a branch from STATE to all states in NEXT list
236 (setf (tNFA--NFA-state-type state) 'branch
237 (tNFA--NFA-state-label state) nil
238 (tNFA--NFA-state-next state) next)
240 (setf (tNFA--NFA-state-count n)
241 (incf (tNFA--NFA-state-in-degree n)))))
244 (defun tNFA--NFA-state-copy (state)
245 ;; Return a copy of STATE. The next link is *not* copied, it is `eq'
246 ;; to the original next link. Use `tNFA--fragment-copy' if you want to
247 ;; recursively copy a chain of states. Note: NFA--state-id must be
248 ;; bound to something appropriate when this function is called.
249 (let ((copy (copy-sequence state)))
250 (setf (tNFA--NFA-state-id copy) (incf NFA--state-id))
255 ;;; ----------------------------------------------------------------
262 (:constructor tNFA--fragment-create (initial final))
267 (defun tNFA--fragment-patch (frag1 frag2)
268 ;; patch FRAG2 onto end of FRAG1; return value is meaningless
269 (tNFA--NFA-state-patch (tNFA--fragment-final frag1)
270 (tNFA--fragment-initial frag2))
271 (setf (tNFA--fragment-final frag1) (tNFA--fragment-final frag2)))
274 (defun tNFA--fragment-copy (fragment)
275 ;; return a copy of FRAGMENT.
276 (declare (special copied-states))
278 (tNFA--fragment-create
279 (tNFA--do-fragment-copy (tNFA--fragment-initial fragment))
280 (cdr (assq (tNFA--fragment-final fragment) copied-states)))))
283 (defun tNFA--do-fragment-copy (state)
284 ;; return a copy of STATE, recursively following and copying links
285 ;; (note: NFA--state-id must be bound to something appropriate when
287 (declare (special copied-states))
288 (let ((copy (tNFA--NFA-state-copy state)))
289 (push (cons state copy) copied-states)
291 ;; if STATE is a branch, copy all links
293 ((eq (tNFA--NFA-state-type copy) 'branch)
294 (setf (tNFA--NFA-state-next copy)
295 (mapcar (lambda (next)
296 (or (cdr (assq next copied-states))
297 (tNFA--do-fragment-copy next)))
298 (tNFA--NFA-state-next copy))))
300 ;; if state doesn't have a next link, return
301 ((or (eq (tNFA--NFA-state-type copy) 'match)
302 (null (tNFA--NFA-state-type copy))))
304 ;; otherwise, copy next link
305 ((tNFA--NFA-state-type copy)
306 ;; for a non-branch STATE, copy next link
307 (setf (tNFA--NFA-state-next copy)
308 ;; if we've already copied next state, set next link to that
309 (or (cdr (assq (tNFA--NFA-state-next copy) copied-states))
310 ;; otherwise, recursively copy next state
311 (tNFA--do-fragment-copy (tNFA--NFA-state-next copy))))))
316 ;;; ----------------------------------------------------------------
323 (:constructor tNFA--DFA-state--create
330 list transitions test wildcard match pool)
333 (defun* tNFA--DFA-state-create (state-list state-pool &key (test 'eq))
334 ;; create DFA state and add it to the state pool
335 (let ((DFA-state (tNFA--DFA-state--create
336 state-list state-pool :test test))
338 (puthash state-list DFA-state (tNFA--DFA-state-pool DFA-state))
340 (dolist (state state-list)
341 ;; if state in state list is...
343 ;; literal state: add literal transition
344 ((eq (tNFA--state-type state) 'literal)
345 (setq tmp-list (tNFA--DFA-state-transitions DFA-state))
346 (add-to-list 'tmp-list (cons (tNFA--state-label state) t))
347 (setf (tNFA--DFA-state-transitions DFA-state) tmp-list))
349 ;; character alternative: add transitions for all alternatives
350 ((eq (tNFA--state-type state) 'char-alt)
351 (dolist (c (tNFA--state-label state))
352 (setq tmp-list (tNFA--DFA-state-transitions DFA-state))
353 (add-to-list 'tmp-list (cons c t))
354 (setf (tNFA--DFA-state-transitions DFA-state) tmp-list)))
356 ;; wildcard or negated character alternative: add wildcard
358 ((or (eq (tNFA--state-type state) 'wildcard)
359 (eq (tNFA--state-type state) 'neg-char-alt))
360 (setf (tNFA--DFA-state-wildcard DFA-state) t))
362 ;; match state: set match tags
363 ((eq (tNFA--state-type state) 'match)
364 (setf (tNFA--DFA-state-match DFA-state)
365 (tNFA--state-tags state)))))
367 ;; return constructed state
371 (defun* tNFA--DFA-state-create-initial (state-list &key (test 'eq))
372 ;; create initial DFA state from initial tNFA state INITIAL-STATE
373 (tNFA--DFA-state-create state-list
374 (make-hash-table :test 'equal)
378 (defalias 'tNFA-match-p 'tNFA--DFA-state-match
379 "Return non-nil if STATE is a matching state, otherwise return nil.")
382 (defalias 'tNFA-wildcard-p 'tNFA--DFA-state-wildcard
383 "Return non-nil if STATE has a wildcard transition,
384 otherwise return nil.")
387 (defun tNFA-transitions (state)
388 "Return list of literal transitions from tNFA state STATE."
389 (mapcar 'car (tNFA--DFA-state-transitions state)))
393 ;;; ----------------------------------------------------------------
396 (defun tNFA--tags-create (num-tags min-tags max-tags)
397 ;; construct a new tags table
398 (let ((vec (make-vector num-tags nil)))
399 (dolist (tag min-tags)
400 (aset vec tag (cons -1 'min)))
401 (dolist (tag max-tags)
402 (aset vec tag (cons -1 'max)))
406 (defun tNFA--tags-copy (tags)
407 ;; return a copy of TAGS table
408 (let* ((len (length tags))
409 (vec (make-vector len nil)))
411 (aset vec i (cons (car (aref tags i))
412 (cdr (aref tags i)))))
416 (defmacro tNFA--tags-set (tags tag val)
417 ;; set value of TAG in TAGS table to VAL
418 `(setcar (aref ,tags ,tag) ,val))
421 (defmacro tNFA--tags-get (tags tag)
422 ;; get value of TAG in TAGS table
423 `(car (aref ,tags ,tag)))
426 (defmacro tNFA--tags-type (tags tag)
427 ;; return tag type ('min or 'max)
428 `(cdr (aref ,tags ,tag)))
431 (defun tNFA--tags< (val tag tags)
432 ;; return non-nil if VAL takes precedence over the value of TAG in
433 ;; TAGS table, nil otherwise
434 (setq tag (aref tags tag))
435 (or (and (eq (cdr tag) 'min)
437 ;;(and (eq (cdr tag) 'max)
442 (defun tNFA--tags-to-groups (tags)
443 ;; Convert TAGS table to a list of indices of group matches. The n'th
444 ;; element of the list is a cons cell, whose car is the starting index
445 ;; of the nth group and whose cdr is its end index. If a group didn't
446 ;; match, the corresponding list element will be null."
447 (let ((groups (make-list (/ (length tags) 2) nil))
450 (dotimes (i (length tags))
451 (if (eq (tNFA--tags-type tags i) 'max)
452 (unless (= (tNFA--tags-get tags i) -1)
453 (setf (nth (caar group-stack) groups)
454 (cons (cdr (pop group-stack))
455 (tNFA--tags-get tags i))))
456 (unless (= (tNFA--tags-get tags i) -1)
457 (push (cons grp (tNFA--tags-get tags i)) group-stack))
464 ;;; ================================================================
467 (defun* tNFA-from-regexp (regexp &key (test 'eq))
468 "Create a tagged NFA that recognizes the regular expression REGEXP.
469 The return value is the initial state of the tagged NFA.
471 REGEXP can be any sequence type (vector, list, or string); it
472 need not be an actual string. Special characters in REGEXP are
473 still just that: elements of the sequence that are characters
474 which have a special meaning in regexps.
476 The :test keyword argument specifies how to test whether two
477 individual elements of STRING are identical. The default is `eq'.
479 Only a subset of the full Emacs regular expression syntax is
480 supported. There is no support for regexp constructs that are
481 only meaningful for strings (character ranges and character
482 classes inside character alternatives, and syntax-related
483 backslash constructs). Back-references and non-greedy postfix
484 operators are not supported, so `?' after a postfix operator
485 loses its special meaning. Also, matches are always anchored, so
486 `$' and `^' lose their special meanings (use `.*' at the
487 beginning and end of the regexp to get an unanchored match)."
489 ;; convert regexp to list, build NFA, and return initial state
490 (declare (special NFA--state-id))
491 (destructuring-bind (fragment num-tags min-tags max-tags regexp)
492 (let ((NFA--state-id -1))
493 (tNFA--from-regexp (append regexp nil) 0 '() '() 'top-level))
495 (error "Syntax error in regexp: missing \"(\"")
496 (setf (tNFA--NFA-state-type (tNFA--fragment-final fragment))
498 (tNFA--DFA-state-create-initial
499 (tNFA--epsilon-boundary
501 (tNFA--state-create-initial
502 (tNFA--fragment-initial fragment) num-tags min-tags max-tags))
508 (defmacro tNFA--regexp-postfix-p (regexp)
509 ;; return t if next token in REGEXP is a postfix operator, nil
511 `(or (eq (car ,regexp) ?*)
512 (eq (car ,regexp) ?+)
513 (eq (car ,regexp) ??)
514 (and (eq (car ,regexp) ?\\)
516 (eq (cadr ,regexp) ?{))))
519 (defun tNFA--from-regexp (regexp num-tags min-tags max-tags
520 &optional top-level shy-group)
521 ;; Construct a tagged NFA fragment from REGEXP, up to first end-group
522 ;; character or end of REGEXP. The TAGS arguments are used to pass the
523 ;; tags created so far. A non-nil TOP-LEVEL indicates that REGEXP is
524 ;; the complete regexp, so we're constructing the entire tNFA. A
525 ;; non-nil SHY-GROUP indicates that we're constructing a shy subgroup
526 ;; fragment. (Both optional arguments are only used for spotting
527 ;; syntax errors in REGEXP.)
529 ;; Returns a list: (FRAGMENT NUM-TAGS MIN-TAGS MAX-TAGS
530 ;; REGEXP). FRAGMENT is the constructed tNFA fragment, REGEXP is the
531 ;; remaining, unused portion of the regexp, and the TAGS return values
532 ;; give the tags created so far.
534 (let* ((new (tNFA--NFA-state-create))
535 (fragment-stack (list (tNFA--fragment-create new new)))
536 fragment copy attach token type group-end-tag)
540 (setq regexp (tNFA--regexp-next-token regexp)
543 regexp (nth 2 regexp))
547 ;; ----- construct new fragment -----
549 ;; syntax error: missing )
550 ((and (null type) (not top-level))
551 (error "Syntax error in regexp:\
552 extra \"(\" or missing \")\""))
554 ;; syntax error: extra )
555 ((and (eq type 'shy-group-end) top-level)
556 (error "Syntax error in regexp:\
557 extra \")\" or missing \"(\""))
559 ;; syntax error: ) ending a shy group
560 ((and (eq type 'shy-group-end) (not shy-group))
561 (error "Syntax error in regexp: \"(\" matched with \")?\""))
563 ;; syntax error: )? ending a group
564 ((and (eq type 'group-end) shy-group)
565 (error "Syntax error in regexp: \"(?\" matched with \")\""))
567 ;; syntax error: postfix operator not after atom
569 (error "Syntax error in regexp: unexpected \"%s\""
570 (char-to-string token)))
573 ;; regexp atom: construct new literal fragment
574 ((or (eq type 'literal) (eq type 'wildcard)
575 (eq type 'char-alt) (eq type 'neg-char-alt))
576 (setq new (tNFA--NFA-state-create
577 type token (tNFA--NFA-state-create))
578 fragment (tNFA--fragment-create
579 new (tNFA--NFA-state-next new))))
581 ;; shy subgroup start: recursively construct subgroup fragment
582 ((eq type 'shy-group-start)
583 (setq new (tNFA--from-regexp
584 regexp num-tags min-tags max-tags nil t)
589 fragment (nth 0 new)))
591 ;; subgroup start: add minimize tag to current fragment, and
592 ;; recursively construct subgroup fragment
593 ((eq type 'group-start)
594 (setq new (tNFA--NFA-state-create))
596 (tNFA--fragment-create
597 (tNFA--NFA-state-create-tag
598 (car (push (1- (incf num-tags)) min-tags))
601 (tNFA--fragment-patch (car fragment-stack) fragment)
602 ;; reserve next tag number for subgroup end tag
603 (setq group-end-tag num-tags)
605 ;; recursively construct subgroup fragment
606 (setq new (tNFA--from-regexp
607 regexp num-tags min-tags max-tags)
612 fragment (nth 0 new)))
615 ;; end of regexp or subgroup: ...
616 ((or (null type) (eq type 'shy-group-end) (eq type 'group-end))
618 ;; if fragment-stack contains only one fragment, throw
619 ;; fragment up to recursion level above
621 ((null (nth 1 fragment-stack))
623 (list (car fragment-stack)
624 num-tags min-tags max-tags regexp)))
626 ;; if fragment-stack contains multiple alternation fragments,
627 ;; attach them all together
631 ;; /----fragment----\
633 ;; ---o------fragment------o--->
638 ;; create a new fragment containing start and end of
641 (tNFA--fragment-create
642 (tNFA--NFA-state-create-branch)
643 (tNFA--NFA-state-create)))
644 ;; patch alternation fragments into new fragment
645 (dolist (frag fragment-stack)
646 (push (tNFA--fragment-initial frag)
647 (tNFA--NFA-state-next
648 (tNFA--fragment-initial fragment)))
649 (setf (tNFA--NFA-state-count
650 (tNFA--fragment-initial frag))
651 (incf (tNFA--NFA-state-in-degree
652 (tNFA--fragment-initial frag))))
653 (tNFA--NFA-state-make-epsilon (tNFA--fragment-final frag)
654 (tNFA--fragment-final fragment)))
655 ;; throw constructed fragment up to recursion level above
657 (list fragment num-tags min-tags max-tags regexp)))
660 ;; | alternation: start new fragment
661 ((eq type 'alternation)
662 (setq new (tNFA--NFA-state-create))
663 (push (tNFA--fragment-create new new) fragment-stack)))
666 ;; ----- attach new fragment -----
668 ;; if next token is not a postfix operator, attach new
669 ;; fragment onto end of current NFA fragment
670 (if (not (tNFA--regexp-postfix-p regexp))
671 (tNFA--fragment-patch (car fragment-stack) fragment)
673 ;; if next token is a postfix operator, splice new fragment
674 ;; into NFA as appropriate
675 (when (eq type 'alternation)
676 (error "Syntax error in regexp: unexpected \"%s\""
677 (char-to-string token)))
678 (setq regexp (tNFA--regexp-next-token regexp)
681 regexp (nth 2 regexp))
684 (setq attach (tNFA--fragment-final (car fragment-stack)))
685 (setq new (tNFA--NFA-state-create))
688 ;; * postfix = \{0,\}:
694 ;; ---attach-----new---
696 ((and (eq (car token) 0) (null (cdr token)))
697 (tNFA--NFA-state-make-branch
698 attach (list (tNFA--fragment-initial fragment) new))
699 (tNFA--NFA-state-make-epsilon
700 (tNFA--fragment-final fragment) attach)
701 (setf (tNFA--fragment-final (car fragment-stack)) new)
704 ;; + postfix = \{1,\}:
710 ;; ---fragment-----new---
712 ((and (eq (car token) 1) (null (cdr token)))
713 (tNFA--NFA-state-patch
714 attach (tNFA--fragment-initial fragment))
715 (tNFA--NFA-state-make-branch
716 (tNFA--fragment-final fragment) (list attach new))
717 (setf (tNFA--fragment-final (car fragment-stack)) new)
720 ;; \{0,n\} (note: ? postfix = \{0,1\}):
728 ;; ? postfix = \{0,1\}: after this we're done
729 (if (eq (cdr token) 1)
731 (setq copy (tNFA--fragment-copy fragment)))
733 (tNFA--NFA-state-make-branch
734 attach (list (tNFA--fragment-initial fragment) new))
735 (tNFA--NFA-state-make-epsilon
736 (tNFA--fragment-final fragment) new)
737 (setf (tNFA--fragment-final (car fragment-stack)) new)
738 ;; prepare for next iteration
740 (setq fragment copy))
742 ;; \{n,\} or \{n,m\}:
744 ;; ---attach----fragment----new---
747 (setq copy (tNFA--fragment-copy fragment))
748 (tNFA--fragment-patch (car fragment-stack) fragment)
749 ;; prepare for next iteration
751 (when (cdr token) (decf (cdr token)))
752 (if (eq (cdr token) 0)
754 (setq fragment copy)))
758 ;; if ending a group, add a maximize tag to end
760 (setq new (tNFA--NFA-state-create)
761 fragment (tNFA--fragment-create
762 (tNFA--NFA-state-create-tag
765 (push group-end-tag max-tags)
766 (tNFA--fragment-patch (car fragment-stack) fragment)))
767 )) ; end of infinite loop and catch
772 ;; Note: hard-coding the parsing like this is ugly, though sufficient
773 ;; for our purposes. Perhaps it would be more elegant to implement
774 ;; this in terms of a proper parser...
776 (defun tNFA--regexp-next-token (regexp)
777 ;; if regexp is empty, return null values for next token type, token
778 ;; and remaining regexp
782 (let ((token (pop regexp))
783 (type 'literal)) ; assume token is literal initially
786 ;; [: gobble up to closing ]
788 ;; character alternatives are stored in lists
791 ;; gobble ] appearing straight after [
792 ((eq (car regexp) ?\]) (push (pop regexp) token))
793 ;; gobble ] appearing straight after [^
794 ((and (eq (car regexp) ?^) (eq (nth 1 regexp) ?\]))
795 (push (pop regexp) token)
796 (push (pop regexp) token)))
797 ;; gobble everything up to closing ]
798 (while (not (eq (car regexp) ?\]))
799 (push (pop regexp) token)
801 (error "Syntax error in regexp: missing \"]\"")))
802 (pop regexp) ; dump closing ]
803 (if (not (eq (car (last token)) ?^))
804 (setq type 'char-alt)
805 (setq type 'neg-char-alt)
806 (setq token (butlast token))))
808 ;; ]: syntax error (always gobbled when parsing [)
810 (error "Syntax error in regexp: missing \"[\""))
812 ;; . * + ?: set appropriate type
813 ((eq token ?*) (setq type 'postfix token (cons 0 nil)))
814 ((eq token ?+) (setq type 'postfix token (cons 1 nil)))
815 ((eq token ??) (setq type 'postfix token (cons 0 1)))
816 ((eq token ?.) (setq type 'wildcard))
818 ;; \: look at next character
820 (unless (setq token (pop regexp))
821 (error "Syntax error in regexp:\
822 missing character after \"\\\""))
825 ((eq token ?|) (setq type 'alternation))
826 ;; \(?: shy group start
827 ((and (eq token ?\() (eq (car regexp) ??))
828 (setq type 'shy-group-start)
830 ;; \)?: shy group end
831 ((and (eq token ?\)) (eq (car regexp) ??))
832 (setq type 'shy-group-end)
835 ((eq token ?\() (setq type 'group-start))
837 ((eq token ?\)) (setq type 'group-end))
839 ;; \{: postfix repetition operator
841 (setq type 'postfix token (cons nil nil))
842 ;; extract first number from repetition operator
843 (while (if (null regexp)
844 (error "Syntax error in regexp:\
845 malformed \\{...\\}")
846 (not (or (eq (car regexp) ?,)
847 (eq (car regexp) ?\\))))
849 (concat (car token) (char-to-string (pop regexp)))))
850 (if (null (car token))
852 (unless (string-match "[0-9]+" (car token))
853 (error "Syntax error in regexp: malformed \\{...\\}"))
854 (setcar token (string-to-number (car token))))
856 ;; if next character is "\", we expect "}" to follow
857 ((eq (car regexp) ?\\)
859 (unless (eq (car regexp) ?})
860 (error "Syntax error in regexp: expected \"}\""))
863 (error "Syntax error in regexp: malformed \\{...\\}"))
864 (setcdr token (car token)))
865 ;; if next character is ",", we expect a second number to
867 ((eq (car regexp) ?,)
869 (while (if (null regexp)
870 (error "Syntax error in regexp:\
871 malformed \\{...\\}")
872 (not (eq (car regexp) ?\\)))
875 (char-to-string (pop regexp)))))
876 (unless (null (cdr token))
877 (unless (string-match "[0-9]+" (cdr token))
878 (error "Syntax error in regexp: malformed \\{...\\}"))
879 (setcdr token (string-to-number (cdr token))))
881 (unless (eq (car regexp) ?})
882 (error "Syntax error in regexp: expected \"}\""))
887 ;; return first token type, token, and remaining regexp
888 (list type token regexp))))
892 ;;; ================================================================
895 (defun tNFA-next-state (tNFA chr pos)
896 "Evolve tNFA according to CHR, which corresponds to position
899 ;; if there is a transition for character CHR...
901 ((setq elem (tNFA--assoc chr (tNFA--DFA-state-transitions tNFA)
902 :test (tNFA--DFA-state-test tNFA)))
903 ;; if next state has not already been computed, do so
904 (unless (tNFA--DFA-state-p (setq state (cdr elem)))
905 (setq state (tNFA--DFA-next-state tNFA chr pos nil))
906 (setcdr elem state)))
908 ;; if there's a wildcard transition...
909 ((setq state (tNFA--DFA-state-wildcard tNFA))
910 ;; if next state has not already been computed, do so
911 (unless (tNFA--DFA-state-p state)
912 (setq state (tNFA--DFA-next-state tNFA chr pos t))
913 (setf (tNFA--DFA-state-wildcard tNFA) state))))
918 (defun tNFA--DFA-next-state (DFA-state chr pos wildcard)
919 (let (state-list state)
920 ;; add all states reached by a CHR transition from DFA-STATE to
923 (dolist (state (tNFA--DFA-state-list DFA-state))
924 (when (or (eq (tNFA--state-type state) 'wildcard)
925 (and (eq (tNFA--state-type state) 'neg-char-alt)
926 (not (memq chr (tNFA--state-label state)))))
927 (push (tNFA--state-create
928 (tNFA--state-next state)
929 (tNFA--tags-copy (tNFA--state-tags state)))
931 (dolist (state (tNFA--DFA-state-list DFA-state))
932 (when (or (and (eq (tNFA--state-type state) 'literal)
933 (eq chr (tNFA--state-label state)))
934 (and (eq (tNFA--state-type state) 'char-alt)
935 (memq chr (tNFA--state-label state)))
936 (and (eq (tNFA--state-type state) 'neg-char-alt)
937 (not (memq chr (tNFA--state-label state))))
938 (eq (tNFA--state-type state) 'wildcard))
939 (push (tNFA--state-create
940 (tNFA--state-next state)
941 (tNFA--tags-copy (tNFA--state-tags state)))
944 ;; if state list is empty, return empty, failure DFA state
946 ;; otherwise, construct new DFA state and add it to the pool if
947 ;; it's not already there
948 (setq state-list (tNFA--epsilon-boundary state-list (1+ pos)))
950 (or (gethash state-list (tNFA--DFA-state-pool DFA-state))
951 (tNFA--DFA-state-create
953 (tNFA--DFA-state-pool DFA-state)
954 :test (tNFA--DFA-state-test DFA-state))))
960 (defun tNFA--epsilon-boundary (state-set pos)
961 ;; Return the tagged epsilon-boundary of the NFA states listed in
962 ;; STATE-SET, that is the set of all states that can be reached via
963 ;; epsilon transitions from some state in STATE-SET (not including
964 ;; states in STATE-SET itself).
965 (let ((queue (queue-create))
969 ;; temporarily link the NFA states to their corresponding tNFA
970 ;; states, and add them to the queue
971 (dolist (t-state state-set)
972 (setf state (tNFA--state-NFA-state t-state)
973 (tNFA--NFA-state-tNFA-state state) t-state)
975 (queue-enqueue queue state))
977 (while (setq state (queue-dequeue queue))
979 ;; branch or epsilon: add next states as necessary, copying tags
981 ((or (eq (tNFA--NFA-state-type state) 'branch)
982 (eq (tNFA--NFA-state-type state) 'epsilon))
983 (dolist (next (if (eq (tNFA--NFA-state-type state) 'epsilon)
984 (list (tNFA--NFA-state-next state))
985 (tNFA--NFA-state-next state)))
986 (unless (tNFA--NFA-state-tNFA-state next)
987 (setf (tNFA--NFA-state-tNFA-state next)
989 next (tNFA--tags-copy (tNFA--NFA-state-tags state))))
991 ;; if next state hasn't already been seen in-degree times,
992 ;; add it to the end of the queue
993 (if (/= (decf (tNFA--NFA-state-count next)) 0)
994 (queue-enqueue queue next)
995 ;; if it has now been seen in-degree times, reset count
996 ;; and add it back to the front of the queue
997 (setf (tNFA--NFA-state-count next)
998 (tNFA--NFA-state-in-degree next))
999 (queue-prepend queue next)))))
1001 ;; tag: add next state if necessary, updating tags if necessary
1002 ((eq (tNFA--NFA-state-type state) 'tag)
1003 (setq next (tNFA--NFA-state-next state))
1004 ;; if next state is not already in results list, or it is
1005 ;; already in results but new tag value takes precedence...
1006 (when (or (not (tNFA--NFA-state-tNFA-state next))
1007 (tNFA--tags< pos (tNFA--NFA-state-tag state)
1008 (tNFA--NFA-state-tags next)))
1009 ;; if next state is already in results, update tag value
1010 (if (tNFA--NFA-state-tNFA-state next)
1011 (tNFA--tags-set (tNFA--NFA-state-tags next)
1012 (tNFA--NFA-state-tag state) pos)
1013 ;; if state is not already in results, copy tags, updating
1014 ;; tag value, and add next state to results list
1015 (setq tags (tNFA--tags-copy (tNFA--NFA-state-tags state)))
1016 (tNFA--tags-set tags (tNFA--NFA-state-tag state) pos)
1017 (setf (tNFA--NFA-state-tNFA-state next)
1018 (tNFA--state-create next tags))
1020 ;; if next state hasn't already been seen in-degree times, add
1021 ;; it to the end of the queue
1022 (if (/= (decf (tNFA--NFA-state-count next)) 0)
1023 (queue-enqueue queue next)
1024 ;; if it has now been seen in-degree times, reset count and
1025 ;; add it back to the front of the queue
1026 (setf (tNFA--NFA-state-count next)
1027 (tNFA--NFA-state-in-degree next))
1028 (queue-prepend queue next))))
1030 ;; anything else is a non-epsilon-transition state, so add it to
1032 (t (push (tNFA--NFA-state-tNFA-state state) result))
1035 ;; reset temporary NFA state link and count
1036 (dolist (state reset)
1037 (setf (tNFA--NFA-state-tNFA-state state) nil
1038 (tNFA--NFA-state-count state)
1039 (tNFA--NFA-state-in-degree state)))
1040 ;; sort result states
1042 (lambda (a b) (< (tNFA--state-id a) (tNFA--state-id b))))
1047 ;;; ================================================================
1050 (defun* tNFA-regexp-match (regexp string &key (test 'eq))
1051 "Return non-nil if STRING matches REGEXP, nil otherwise.
1052 Sets the match data if there was a match; see `match-beginning',
1053 `match-end' and `match-string'.
1055 REGEXP and STRING can be any sequence type (vector, list, or
1056 string); they need not be actual strings. Special characters in
1057 REGEXP are still just that: elements of the sequence that are
1058 characters which have a special meaning in regexps.
1060 The :test keyword argument specifies how to test whether two
1061 individual elements of STRING are identical. The default is `eq'.
1063 Only a subset of the full Emacs regular expression syntax is
1064 supported. There is no support for regexp constructs that are
1065 only meaningful for strings (character ranges and character
1066 classes inside character alternatives, and syntax-related
1067 backslash constructs). Back-references and non-greedy postfix
1068 operators are not supported, so `?' after a postfix operator
1069 loses its special meaning. Also, matches are always anchored, so
1070 `$' and `^' lose their special meanings (use `.*' at the
1071 beginning and end of the regexp to get an unanchored match)."
1073 (let ((tNFA (tNFA-from-regexp regexp :test test))
1074 (i -1) tags match-data group-stack (grp 0))
1076 ;; evolve tNFA according to characters of STRING
1078 (dolist (chr (append string nil))
1079 (unless (setq tNFA (tNFA-next-state tNFA chr (incf i)))
1082 ;; if REGEXP matched...
1083 (when (setq tags (tNFA--DFA-state-match tNFA))
1084 (setq match-data (make-list (+ (length tags) 2) nil))
1086 (setf (nth 0 match-data) 0
1087 (nth 1 match-data) (length string))
1088 ;; set group match data if there were any groups
1089 (dotimes (i (length tags))
1090 (if (eq (tNFA--tags-type tags i) 'max)
1091 (unless (= (tNFA--tags-get tags i) -1)
1092 (setf (nth (1+ (* 2 (pop group-stack))) match-data)
1093 (tNFA--tags-get tags i)))
1095 (unless (= (tNFA--tags-get tags i) -1)
1096 (push grp group-stack)
1097 (setf (nth (* 2 grp) match-data)
1098 (tNFA--tags-get tags i)))))
1099 (set-match-data match-data)
1103 (defun tNFA-group-data (tNFA)
1104 "Return the group match data associated with a tNFA state."
1105 (tNFA--tags-to-groups (tNFA--DFA-state-match tNFA)))
1111 ;;; tNFA.el ends here