]> code.delx.au - gnu-emacs-elpa/blob - packages/tNFA/tNFA.el
Merge commit '4709fc4530da4ddfd29b910763c801292b228f69' from diff-hl
[gnu-emacs-elpa] / packages / tNFA / tNFA.el
1 ;;; tNFA.el --- Tagged non-deterministic finite-state automata
2
3 ;; Copyright (C) 2008-2010, 2012 Free Software Foundation, Inc
4
5 ;; Author: Toby Cubitt <toby-predictive@dr-qubit.org>
6 ;; Version: 0.1.1
7 ;; Keywords: extensions, matching, data structures
8 ;; tNFA, NFA, DFA, finite state automata, automata, regexp
9 ;; Package-Requires: ((queue "0.1"))
10 ;; URL: http://www.dr-qubit.org/emacs.php
11 ;; Repository: http://www.dr-qubit.org/git/predictive.git
12
13 ;; This file is part of Emacs.
14 ;;
15 ;; GNU Emacs is free software: you can redistribute it and/or modify it under
16 ;; the terms of the GNU General Public License as published by the Free
17 ;; Software Foundation, either version 3 of the License, or (at your option)
18 ;; any later version.
19 ;;
20 ;; GNU Emacs is distributed in the hope that it will be useful, but WITHOUT
21 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22 ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
23 ;; more details.
24 ;;
25 ;; You should have received a copy of the GNU General Public License along
26 ;; with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
27
28
29 ;;; Commentary:
30 ;;
31 ;; A tagged, non-deterministic finite state automata (NFA) is an abstract
32 ;; computing machine that recognises regular languages. In layman's terms,
33 ;; they are used to decide whether a string matches a regular expression. The
34 ;; "tagged" part allows the NFA to do group-capture: it returns information
35 ;; about which parts of a string matched which subgroup of the regular
36 ;; expression.
37 ;;
38 ;; Why re-implement regular expression matching when Emacs comes with
39 ;; extensive built-in support for regexps? Primarily, because some algorithms
40 ;; require access to the NFA states produced part way through the regular
41 ;; expression matching process (see the trie.el package for an
42 ;; example). Secondarily, because Emacs regexps only work on strings, whereas
43 ;; regular expressions can usefully be used in Elisp code to match other
44 ;; sequence types, not just strings.
45 ;;
46 ;; A tagged NFA can be created from a regular expression using
47 ;; `tNFA-from-regexp', and its state can be updated using
48 ;; `tNFA-next-state'. You can discover whether a state is a matching state
49 ;; using `tNFA-match-p', extract subgroup capture data from it using
50 ;; `tNFA-group-data', check whether a state has any wildcard transitions using
51 ;; `tNFA-wildcard-p', and get a list of non-wildcard transitions using
52 ;; `tNFA-transitions'. Finally, `tNFA-regexp-match' uses tagged NFAs to decide
53 ;; whether a regexp matches a given string.
54 ;;
55 ;; Note that Emacs' regexps are not regular expressions in the original
56 ;; meaning of that phrase. Emacs regexps implement additional features (in
57 ;; particular, back-references) that allow them to match far more than just
58 ;; regular languages. This comes at a cost: regexp matching can potentially be
59 ;; very slow (NP-hard in fact, though the hard cases rarely crop up in
60 ;; practise), whereas there are efficient (polynomial-time) algorithms for
61 ;; matching regular expressions (in the original sense). Therefore, this
62 ;; package only supports a subset of the full Emacs regular expression
63 ;; syntax. See the function docstrings for more information.
64 ;;
65 ;; This package essentially implements Laurikari's algorithm, as described in
66 ;; his master's thesis, but it builds the corresponding tagged deterministic
67 ;; finite state automaton (DFA) on-the-fly as needed.
68 ;;
69 ;; This package uses the queue package queue.el.
70
71
72 ;;; Change Log:
73 ;;
74 ;; Version 0.1.1
75 ;; * work-around mysterious byte-compiler bug by defining
76 ;; `tNFA--NFA-state-create' and `tNFA--NFA-state-create-tag' via `defun'
77 ;; instead of directly in `defstruct'
78 ;;
79 ;; Version 0.1
80 ;; * initial version
81
82
83
84 ;;; Code:
85
86 (eval-when-compile (require 'cl))
87 (require 'queue)
88
89
90
91 ;;; ================================================================
92 ;;; Replcements for CL functions
93
94 (defun* tNFA--assoc (item alist &key (test 'eq))
95 ;; Return first cons cell in ALIST whose CAR matches ITEM according to
96 ;; :test function (defaulting to `eq')
97 (while (and alist
98 (or (not (consp (car alist)))
99 (not (funcall test item (caar alist)))))
100 (setq alist (cdr alist)))
101 (car alist))
102
103
104
105 ;;; ================================================================
106 ;;; Data structures
107
108 ;;; ----------------------------------------------------------------
109 ;;; tagged NFA states
110
111 (defstruct
112 (tNFA--state
113 (:constructor nil)
114 (:constructor tNFA--state-create-initial
115 (NFA-state num-tags min-tags max-tags
116 &aux
117 (tags (tNFA--tags-create num-tags min-tags max-tags))))
118 (:constructor tNFA--state-create (NFA-state tags))
119 (:copier nil))
120 NFA-state tags)
121
122 (defmacro tNFA--state-id (state)
123 `(tNFA--NFA-state-id (tNFA--state-NFA-state ,state)))
124
125 (defmacro tNFA--state-type (state)
126 `(tNFA--NFA-state-type (tNFA--state-NFA-state ,state)))
127
128 (defmacro tNFA--state-label (state)
129 `(tNFA--NFA-state-label (tNFA--state-NFA-state ,state)))
130
131 (defmacro tNFA--state-in-degree (state)
132 `(tNFA--NFA-state-in-degree (tNFA--state-NFA-state ,state)))
133
134 (defmacro tNFA--state-next (state)
135 `(tNFA--NFA-state-next (tNFA--state-NFA-state ,state)))
136
137 (defmacro tNFA--state-count (state)
138 `(tNFA--NFA-state-count (tNFA--state-NFA-state ,state)))
139
140
141
142 ;;; ----------------------------------------------------------------
143 ;;; NFA states
144
145 (declare (special NFA--state-id))
146
147 (defstruct
148 (tNFA--NFA-state
149 (:type vector)
150 (:constructor nil)
151 (:constructor tNFA---NFA-state-create
152 (&optional type label next
153 &aux
154 (in-degree 0)
155 (count 0)
156 (id (incf NFA--state-id))
157 ;; (dummy
158 ;; (when next
159 ;; (setf (tNFA--NFA-state-count next)
160 ;; (incf (tNFA--NFA-state-in-degree next)))))
161 ))
162 (:constructor tNFA--NFA-state-create-branch
163 (&rest next
164 &aux
165 (type 'branch)
166 (in-degree 0)
167 (count 0)
168 (id (incf NFA--state-id))))
169 (:constructor tNFA---NFA-state-create-tag
170 (tag &optional next
171 &aux
172 (type 'tag)
173 (label tag)
174 (in-degree 0)
175 (count 0)
176 (id (incf NFA--state-id))
177 ;; (dummy
178 ;; (when next
179 ;; (setf (tNFA--NFA-state-count next)
180 ;; (incf (tNFA--NFA-state-in-degree next)))))
181 ))
182 (:copier nil))
183 id type label in-degree
184 count tNFA-state ; used internally in NFA evolution algorithms
185 next)
186
187
188 ;; Define these via defun instead of using the dummy argument in the
189 ;; above defstruct to work around a mysterious byte-compiler bug.
190
191 (defun tNFA--NFA-state-create (&optional type label next)
192 (when next
193 (setf (tNFA--NFA-state-count next)
194 (incf (tNFA--NFA-state-in-degree next))))
195 (tNFA---NFA-state-create type label next))
196
197 (defun tNFA--NFA-state-create-tag (tag &optional next)
198 (when next
199 (setf (tNFA--NFA-state-count next)
200 (incf (tNFA--NFA-state-in-degree next))))
201 (tNFA---NFA-state-create-tag tag next))
202
203
204 ;; tag number for a tagged epsilon transition is stored in label slot
205 (defalias 'tNFA--NFA-state-tag 'tNFA--NFA-state-label)
206
207 (defmacro tNFA--NFA-state-tags (state)
208 `(tNFA--state-tags (tNFA--NFA-state-tNFA-state ,state)))
209
210
211 (defun tNFA--NFA-state-patch (attach state)
212 ;; patch STATE onto ATTACH. Return value is meaningless
213 (setf
214 (tNFA--NFA-state-type attach)
215 (tNFA--NFA-state-type state)
216 (tNFA--NFA-state-label attach)
217 (tNFA--NFA-state-label state)
218 (tNFA--NFA-state-next attach)
219 (tNFA--NFA-state-next state)
220 (tNFA--NFA-state-count state)
221 (incf (tNFA--NFA-state-in-degree state))))
222
223
224 (defun tNFA--NFA-state-make-epsilon (state next)
225 ;; create an epsilon transition from STATE to NEXT
226 (setf
227 (tNFA--NFA-state-type state) 'epsilon
228 (tNFA--NFA-state-label state) nil
229 (tNFA--NFA-state-next state) next
230 (tNFA--NFA-state-count next)
231 (incf (tNFA--NFA-state-in-degree next))))
232
233
234 (defun tNFA--NFA-state-make-branch (state next)
235 ;; create a branch from STATE to all states in NEXT list
236 (setf (tNFA--NFA-state-type state) 'branch
237 (tNFA--NFA-state-label state) nil
238 (tNFA--NFA-state-next state) next)
239 (dolist (n next)
240 (setf (tNFA--NFA-state-count n)
241 (incf (tNFA--NFA-state-in-degree n)))))
242
243
244 (defun tNFA--NFA-state-copy (state)
245 ;; Return a copy of STATE. The next link is *not* copied, it is `eq'
246 ;; to the original next link. Use `tNFA--fragment-copy' if you want to
247 ;; recursively copy a chain of states. Note: NFA--state-id must be
248 ;; bound to something appropriate when this function is called.
249 (let ((copy (copy-sequence state)))
250 (setf (tNFA--NFA-state-id copy) (incf NFA--state-id))
251 copy))
252
253
254
255 ;;; ----------------------------------------------------------------
256 ;;; NFA fragments
257
258 (defstruct
259 (tNFA--fragment
260 (:type vector)
261 (:constructor nil)
262 (:constructor tNFA--fragment-create (initial final))
263 (:copier nil))
264 initial final)
265
266
267 (defun tNFA--fragment-patch (frag1 frag2)
268 ;; patch FRAG2 onto end of FRAG1; return value is meaningless
269 (tNFA--NFA-state-patch (tNFA--fragment-final frag1)
270 (tNFA--fragment-initial frag2))
271 (setf (tNFA--fragment-final frag1) (tNFA--fragment-final frag2)))
272
273
274 (defun tNFA--fragment-copy (fragment)
275 ;; return a copy of FRAGMENT.
276 (declare (special copied-states))
277 (let (copied-states)
278 (tNFA--fragment-create
279 (tNFA--do-fragment-copy (tNFA--fragment-initial fragment))
280 (cdr (assq (tNFA--fragment-final fragment) copied-states)))))
281
282
283 (defun tNFA--do-fragment-copy (state)
284 ;; return a copy of STATE, recursively following and copying links
285 ;; (note: NFA--state-id must be bound to something appropriate when
286 ;; this is called)
287 (declare (special copied-states))
288 (let ((copy (tNFA--NFA-state-copy state)))
289 (push (cons state copy) copied-states)
290
291 ;; if STATE is a branch, copy all links
292 (cond
293 ((eq (tNFA--NFA-state-type copy) 'branch)
294 (setf (tNFA--NFA-state-next copy)
295 (mapcar (lambda (next)
296 (or (cdr (assq next copied-states))
297 (tNFA--do-fragment-copy next)))
298 (tNFA--NFA-state-next copy))))
299
300 ;; if state doesn't have a next link, return
301 ((or (eq (tNFA--NFA-state-type copy) 'match)
302 (null (tNFA--NFA-state-type copy))))
303
304 ;; otherwise, copy next link
305 ((tNFA--NFA-state-type copy)
306 ;; for a non-branch STATE, copy next link
307 (setf (tNFA--NFA-state-next copy)
308 ;; if we've already copied next state, set next link to that
309 (or (cdr (assq (tNFA--NFA-state-next copy) copied-states))
310 ;; otherwise, recursively copy next state
311 (tNFA--do-fragment-copy (tNFA--NFA-state-next copy))))))
312 copy))
313
314
315
316 ;;; ----------------------------------------------------------------
317 ;;; DFA states
318
319 (defstruct
320 (tNFA--DFA-state
321 :named
322 (:constructor nil)
323 (:constructor tNFA--DFA-state--create
324 (list pool
325 &key
326 (test 'eq)
327 &aux
328 (transitions ())))
329 (:copier nil))
330 list transitions test wildcard match pool)
331
332
333 (defun* tNFA--DFA-state-create (state-list state-pool &key (test 'eq))
334 ;; create DFA state and add it to the state pool
335 (let ((DFA-state (tNFA--DFA-state--create
336 state-list state-pool :test test))
337 tmp-list)
338 (puthash state-list DFA-state (tNFA--DFA-state-pool DFA-state))
339
340 (dolist (state state-list)
341 ;; if state in state list is...
342 (cond
343 ;; literal state: add literal transition
344 ((eq (tNFA--state-type state) 'literal)
345 (setq tmp-list (tNFA--DFA-state-transitions DFA-state))
346 (add-to-list 'tmp-list (cons (tNFA--state-label state) t))
347 (setf (tNFA--DFA-state-transitions DFA-state) tmp-list))
348
349 ;; character alternative: add transitions for all alternatives
350 ((eq (tNFA--state-type state) 'char-alt)
351 (dolist (c (tNFA--state-label state))
352 (setq tmp-list (tNFA--DFA-state-transitions DFA-state))
353 (add-to-list 'tmp-list (cons c t))
354 (setf (tNFA--DFA-state-transitions DFA-state) tmp-list)))
355
356 ;; wildcard or negated character alternative: add wildcard
357 ;; transistion
358 ((or (eq (tNFA--state-type state) 'wildcard)
359 (eq (tNFA--state-type state) 'neg-char-alt))
360 (setf (tNFA--DFA-state-wildcard DFA-state) t))
361
362 ;; match state: set match tags
363 ((eq (tNFA--state-type state) 'match)
364 (setf (tNFA--DFA-state-match DFA-state)
365 (tNFA--state-tags state)))))
366
367 ;; return constructed state
368 DFA-state))
369
370
371 (defun* tNFA--DFA-state-create-initial (state-list &key (test 'eq))
372 ;; create initial DFA state from initial tNFA state INITIAL-STATE
373 (tNFA--DFA-state-create state-list
374 (make-hash-table :test 'equal)
375 :test test))
376
377
378 (defalias 'tNFA-match-p 'tNFA--DFA-state-match
379 "Return non-nil if STATE is a matching state, otherwise return nil.")
380
381
382 (defalias 'tNFA-wildcard-p 'tNFA--DFA-state-wildcard
383 "Return non-nil if STATE has a wildcard transition,
384 otherwise return nil.")
385
386
387 (defun tNFA-transitions (state)
388 "Return list of literal transitions from tNFA state STATE."
389 (mapcar 'car (tNFA--DFA-state-transitions state)))
390
391
392
393 ;;; ----------------------------------------------------------------
394 ;;; tag tables
395
396 (defun tNFA--tags-create (num-tags min-tags max-tags)
397 ;; construct a new tags table
398 (let ((vec (make-vector num-tags nil)))
399 (dolist (tag min-tags)
400 (aset vec tag (cons -1 'min)))
401 (dolist (tag max-tags)
402 (aset vec tag (cons -1 'max)))
403 vec))
404
405
406 (defun tNFA--tags-copy (tags)
407 ;; return a copy of TAGS table
408 (let* ((len (length tags))
409 (vec (make-vector len nil)))
410 (dotimes (i len)
411 (aset vec i (cons (car (aref tags i))
412 (cdr (aref tags i)))))
413 vec))
414
415
416 (defmacro tNFA--tags-set (tags tag val)
417 ;; set value of TAG in TAGS table to VAL
418 `(setcar (aref ,tags ,tag) ,val))
419
420
421 (defmacro tNFA--tags-get (tags tag)
422 ;; get value of TAG in TAGS table
423 `(car (aref ,tags ,tag)))
424
425
426 (defmacro tNFA--tags-type (tags tag)
427 ;; return tag type ('min or 'max)
428 `(cdr (aref ,tags ,tag)))
429
430
431 (defun tNFA--tags< (val tag tags)
432 ;; return non-nil if VAL takes precedence over the value of TAG in
433 ;; TAGS table, nil otherwise
434 (setq tag (aref tags tag))
435 (or (and (eq (cdr tag) 'min)
436 (< val (car tag)))
437 ;;(and (eq (cdr tag) 'max)
438 (> val (car tag));)
439 ))
440
441
442 (defun tNFA--tags-to-groups (tags)
443 ;; Convert TAGS table to a list of indices of group matches. The n'th
444 ;; element of the list is a cons cell, whose car is the starting index
445 ;; of the nth group and whose cdr is its end index. If a group didn't
446 ;; match, the corresponding list element will be null."
447 (let ((groups (make-list (/ (length tags) 2) nil))
448 group-stack
449 (grp 0))
450 (dotimes (i (length tags))
451 (if (eq (tNFA--tags-type tags i) 'max)
452 (unless (= (tNFA--tags-get tags i) -1)
453 (setf (nth (caar group-stack) groups)
454 (cons (cdr (pop group-stack))
455 (tNFA--tags-get tags i))))
456 (unless (= (tNFA--tags-get tags i) -1)
457 (push (cons grp (tNFA--tags-get tags i)) group-stack))
458 (incf grp)))
459 groups))
460
461
462
463
464 ;;; ================================================================
465 ;;; Regexp -> tNFA
466
467 ;;;###autoload
468 (defun* tNFA-from-regexp (regexp &key (test 'eq))
469 "Create a tagged NFA that recognizes the regular expression REGEXP.
470 The return value is the initial state of the tagged NFA.
471
472 REGEXP can be any sequence type (vector, list, or string); it
473 need not be an actual string. Special characters in REGEXP are
474 still just that: elements of the sequence that are characters
475 which have a special meaning in regexps.
476
477 The :test keyword argument specifies how to test whether two
478 individual elements of STRING are identical. The default is `eq'.
479
480 Only a subset of the full Emacs regular expression syntax is
481 supported. There is no support for regexp constructs that are
482 only meaningful for strings (character ranges and character
483 classes inside character alternatives, and syntax-related
484 backslash constructs). Back-references and non-greedy postfix
485 operators are not supported, so `?' after a postfix operator
486 loses its special meaning. Also, matches are always anchored, so
487 `$' and `^' lose their special meanings (use `.*' at the
488 beginning and end of the regexp to get an unanchored match)."
489
490 ;; convert regexp to list, build NFA, and return initial state
491 (declare (special NFA--state-id))
492 (destructuring-bind (fragment num-tags min-tags max-tags regexp)
493 (let ((NFA--state-id -1))
494 (tNFA--from-regexp (append regexp nil) 0 '() '() 'top-level))
495 (if regexp
496 (error "Syntax error in regexp: missing \"(\"")
497 (setf (tNFA--NFA-state-type (tNFA--fragment-final fragment))
498 'match)
499 (tNFA--DFA-state-create-initial
500 (tNFA--epsilon-boundary
501 (list
502 (tNFA--state-create-initial
503 (tNFA--fragment-initial fragment) num-tags min-tags max-tags))
504 0)
505 :test test)
506 )))
507
508
509 (defmacro tNFA--regexp-postfix-p (regexp)
510 ;; return t if next token in REGEXP is a postfix operator, nil
511 ;; otherwise
512 `(or (eq (car ,regexp) ?*)
513 (eq (car ,regexp) ?+)
514 (eq (car ,regexp) ??)
515 (and (eq (car ,regexp) ?\\)
516 (cdr ,regexp)
517 (eq (cadr ,regexp) ?{))))
518
519
520 (defun tNFA--from-regexp (regexp num-tags min-tags max-tags
521 &optional top-level shy-group)
522 ;; Construct a tagged NFA fragment from REGEXP, up to first end-group
523 ;; character or end of REGEXP. The TAGS arguments are used to pass the
524 ;; tags created so far. A non-nil TOP-LEVEL indicates that REGEXP is
525 ;; the complete regexp, so we're constructing the entire tNFA. A
526 ;; non-nil SHY-GROUP indicates that we're constructing a shy subgroup
527 ;; fragment. (Both optional arguments are only used for spotting
528 ;; syntax errors in REGEXP.)
529 ;;
530 ;; Returns a list: (FRAGMENT NUM-TAGS MIN-TAGS MAX-TAGS
531 ;; REGEXP). FRAGMENT is the constructed tNFA fragment, REGEXP is the
532 ;; remaining, unused portion of the regexp, and the TAGS return values
533 ;; give the tags created so far.
534
535 (let* ((new (tNFA--NFA-state-create))
536 (fragment-stack (list (tNFA--fragment-create new new)))
537 fragment copy attach token type group-end-tag)
538
539 (catch 'constructed
540 (while t
541 (setq regexp (tNFA--regexp-next-token regexp)
542 type (nth 0 regexp)
543 token (nth 1 regexp)
544 regexp (nth 2 regexp))
545 (setq fragment nil
546 group-end-tag nil)
547
548 ;; ----- construct new fragment -----
549 (cond
550 ;; syntax error: missing )
551 ((and (null type) (not top-level))
552 (error "Syntax error in regexp:\
553 extra \"(\" or missing \")\""))
554
555 ;; syntax error: extra )
556 ((and (eq type 'shy-group-end) top-level)
557 (error "Syntax error in regexp:\
558 extra \")\" or missing \"(\""))
559
560 ;; syntax error: ) ending a shy group
561 ((and (eq type 'shy-group-end) (not shy-group))
562 (error "Syntax error in regexp: \"(\" matched with \")?\""))
563
564 ;; syntax error: )? ending a group
565 ((and (eq type 'group-end) shy-group)
566 (error "Syntax error in regexp: \"(?\" matched with \")\""))
567
568 ;; syntax error: postfix operator not after atom
569 ((eq type 'postfix)
570 (error "Syntax error in regexp: unexpected \"%s\""
571 (char-to-string token)))
572
573
574 ;; regexp atom: construct new literal fragment
575 ((or (eq type 'literal) (eq type 'wildcard)
576 (eq type 'char-alt) (eq type 'neg-char-alt))
577 (setq new (tNFA--NFA-state-create
578 type token (tNFA--NFA-state-create))
579 fragment (tNFA--fragment-create
580 new (tNFA--NFA-state-next new))))
581
582 ;; shy subgroup start: recursively construct subgroup fragment
583 ((eq type 'shy-group-start)
584 (setq new (tNFA--from-regexp
585 regexp num-tags min-tags max-tags nil t)
586 num-tags (nth 1 new)
587 min-tags (nth 2 new)
588 max-tags (nth 3 new)
589 regexp (nth 4 new)
590 fragment (nth 0 new)))
591
592 ;; subgroup start: add minimize tag to current fragment, and
593 ;; recursively construct subgroup fragment
594 ((eq type 'group-start)
595 (setq new (tNFA--NFA-state-create))
596 (setq fragment
597 (tNFA--fragment-create
598 (tNFA--NFA-state-create-tag
599 (car (push (1- (incf num-tags)) min-tags))
600 new)
601 new))
602 (tNFA--fragment-patch (car fragment-stack) fragment)
603 ;; reserve next tag number for subgroup end tag
604 (setq group-end-tag num-tags)
605 (incf num-tags)
606 ;; recursively construct subgroup fragment
607 (setq new (tNFA--from-regexp
608 regexp num-tags min-tags max-tags)
609 num-tags (nth 1 new)
610 min-tags (nth 2 new)
611 max-tags (nth 3 new)
612 regexp (nth 4 new)
613 fragment (nth 0 new)))
614
615
616 ;; end of regexp or subgroup: ...
617 ((or (null type) (eq type 'shy-group-end) (eq type 'group-end))
618
619 ;; if fragment-stack contains only one fragment, throw
620 ;; fragment up to recursion level above
621 (cond
622 ((null (nth 1 fragment-stack))
623 (throw 'constructed
624 (list (car fragment-stack)
625 num-tags min-tags max-tags regexp)))
626
627 ;; if fragment-stack contains multiple alternation fragments,
628 ;; attach them all together
629 ;;
630 ;; .--fragment--.
631 ;; / \
632 ;; /----fragment----\
633 ;; / \
634 ;; ---o------fragment------o--->
635 ;; \ . /
636 ;; \ . /
637 ;; .
638 (t
639 ;; create a new fragment containing start and end of
640 ;; alternation
641 (setq fragment
642 (tNFA--fragment-create
643 (tNFA--NFA-state-create-branch)
644 (tNFA--NFA-state-create)))
645 ;; patch alternation fragments into new fragment
646 (dolist (frag fragment-stack)
647 (push (tNFA--fragment-initial frag)
648 (tNFA--NFA-state-next
649 (tNFA--fragment-initial fragment)))
650 (setf (tNFA--NFA-state-count
651 (tNFA--fragment-initial frag))
652 (incf (tNFA--NFA-state-in-degree
653 (tNFA--fragment-initial frag))))
654 (tNFA--NFA-state-make-epsilon (tNFA--fragment-final frag)
655 (tNFA--fragment-final fragment)))
656 ;; throw constructed fragment up to recursion level above
657 (throw 'constructed
658 (list fragment num-tags min-tags max-tags regexp)))
659 ))
660
661 ;; | alternation: start new fragment
662 ((eq type 'alternation)
663 (setq new (tNFA--NFA-state-create))
664 (push (tNFA--fragment-create new new) fragment-stack)))
665
666
667 ;; ----- attach new fragment -----
668 (when fragment
669 ;; if next token is not a postfix operator, attach new
670 ;; fragment onto end of current NFA fragment
671 (if (not (tNFA--regexp-postfix-p regexp))
672 (tNFA--fragment-patch (car fragment-stack) fragment)
673
674 ;; if next token is a postfix operator, splice new fragment
675 ;; into NFA as appropriate
676 (when (eq type 'alternation)
677 (error "Syntax error in regexp: unexpected \"%s\""
678 (char-to-string token)))
679 (setq regexp (tNFA--regexp-next-token regexp)
680 type (nth 0 regexp)
681 token (nth 1 regexp)
682 regexp (nth 2 regexp))
683
684 (while fragment
685 (setq attach (tNFA--fragment-final (car fragment-stack)))
686 (setq new (tNFA--NFA-state-create))
687 (cond
688
689 ;; * postfix = \{0,\}:
690 ;;
691 ;; .--fragment--.
692 ;; / \
693 ;; \ ______/
694 ;; \ /
695 ;; ---attach-----new---
696 ;;
697 ((and (eq (car token) 0) (null (cdr token)))
698 (tNFA--NFA-state-make-branch
699 attach (list (tNFA--fragment-initial fragment) new))
700 (tNFA--NFA-state-make-epsilon
701 (tNFA--fragment-final fragment) attach)
702 (setf (tNFA--fragment-final (car fragment-stack)) new)
703 (setq fragment nil))
704
705 ;; + postfix = \{1,\}:
706 ;;
707 ;; .----.
708 ;; / \
709 ;; / \
710 ;; \ /
711 ;; ---fragment-----new---
712 ;;
713 ((and (eq (car token) 1) (null (cdr token)))
714 (tNFA--NFA-state-patch
715 attach (tNFA--fragment-initial fragment))
716 (tNFA--NFA-state-make-branch
717 (tNFA--fragment-final fragment) (list attach new))
718 (setf (tNFA--fragment-final (car fragment-stack)) new)
719 (setq fragment nil))
720
721 ;; \{0,n\} (note: ? postfix = \{0,1\}):
722 ;;
723 ;; .--fragment--.
724 ;; / \
725 ;; ---attach new---
726 ;; \______________/
727 ;;
728 ((eq (car token) 0)
729 ;; ? postfix = \{0,1\}: after this we're done
730 (if (eq (cdr token) 1)
731 (setq copy nil)
732 (setq copy (tNFA--fragment-copy fragment)))
733 ;; attach fragment
734 (tNFA--NFA-state-make-branch
735 attach (list (tNFA--fragment-initial fragment) new))
736 (tNFA--NFA-state-make-epsilon
737 (tNFA--fragment-final fragment) new)
738 (setf (tNFA--fragment-final (car fragment-stack)) new)
739 ;; prepare for next iteration
740 (decf (cdr token))
741 (setq fragment copy))
742
743 ;; \{n,\} or \{n,m\}:
744 ;;
745 ;; ---attach----fragment----new---
746 ;;
747 (t
748 (setq copy (tNFA--fragment-copy fragment))
749 (tNFA--fragment-patch (car fragment-stack) fragment)
750 ;; prepare for next iteration
751 (decf (car token))
752 (when (cdr token) (decf (cdr token)))
753 (if (eq (cdr token) 0)
754 (setq fragment nil)
755 (setq fragment copy)))
756 )))
757
758
759 ;; if ending a group, add a maximize tag to end
760 (when group-end-tag
761 (setq new (tNFA--NFA-state-create)
762 fragment (tNFA--fragment-create
763 (tNFA--NFA-state-create-tag
764 group-end-tag new)
765 new))
766 (push group-end-tag max-tags)
767 (tNFA--fragment-patch (car fragment-stack) fragment)))
768 )) ; end of infinite loop and catch
769 ))
770
771
772
773 ;; Note: hard-coding the parsing like this is ugly, though sufficient
774 ;; for our purposes. Perhaps it would be more elegant to implement
775 ;; this in terms of a proper parser...
776
777 (defun tNFA--regexp-next-token (regexp)
778 ;; if regexp is empty, return null values for next token type, token
779 ;; and remaining regexp
780 (if (null regexp)
781 (list nil nil nil)
782
783 (let ((token (pop regexp))
784 (type 'literal)) ; assume token is literal initially
785 (cond
786
787 ;; [: gobble up to closing ]
788 ((eq token ?\[)
789 ;; character alternatives are stored in lists
790 (setq token '())
791 (cond
792 ;; gobble ] appearing straight after [
793 ((eq (car regexp) ?\]) (push (pop regexp) token))
794 ;; gobble ] appearing straight after [^
795 ((and (eq (car regexp) ?^) (eq (nth 1 regexp) ?\]))
796 (push (pop regexp) token)
797 (push (pop regexp) token)))
798 ;; gobble everything up to closing ]
799 (while (not (eq (car regexp) ?\]))
800 (push (pop regexp) token)
801 (unless regexp
802 (error "Syntax error in regexp: missing \"]\"")))
803 (pop regexp) ; dump closing ]
804 (if (not (eq (car (last token)) ?^))
805 (setq type 'char-alt)
806 (setq type 'neg-char-alt)
807 (setq token (butlast token))))
808
809 ;; ]: syntax error (always gobbled when parsing [)
810 ((eq token ?\])
811 (error "Syntax error in regexp: missing \"[\""))
812
813 ;; . * + ?: set appropriate type
814 ((eq token ?*) (setq type 'postfix token (cons 0 nil)))
815 ((eq token ?+) (setq type 'postfix token (cons 1 nil)))
816 ((eq token ??) (setq type 'postfix token (cons 0 1)))
817 ((eq token ?.) (setq type 'wildcard))
818
819 ;; \: look at next character
820 ((eq token ?\\)
821 (unless (setq token (pop regexp))
822 (error "Syntax error in regexp:\
823 missing character after \"\\\""))
824 (cond
825 ;; |: alternation
826 ((eq token ?|) (setq type 'alternation))
827 ;; \(?: shy group start
828 ((and (eq token ?\() (eq (car regexp) ??))
829 (setq type 'shy-group-start)
830 (pop regexp))
831 ;; \)?: shy group end
832 ((and (eq token ?\)) (eq (car regexp) ??))
833 (setq type 'shy-group-end)
834 (pop regexp))
835 ;; \(: group start
836 ((eq token ?\() (setq type 'group-start))
837 ;; \): group end
838 ((eq token ?\)) (setq type 'group-end))
839
840 ;; \{: postfix repetition operator
841 ((eq token ?{)
842 (setq type 'postfix token (cons nil nil))
843 ;; extract first number from repetition operator
844 (while (if (null regexp)
845 (error "Syntax error in regexp:\
846 malformed \\{...\\}")
847 (not (or (eq (car regexp) ?,)
848 (eq (car regexp) ?\\))))
849 (setcar token
850 (concat (car token) (char-to-string (pop regexp)))))
851 (if (null (car token))
852 (setcar token 0)
853 (unless (string-match "[0-9]+" (car token))
854 (error "Syntax error in regexp: malformed \\{...\\}"))
855 (setcar token (string-to-number (car token))))
856 (cond
857 ;; if next character is "\", we expect "}" to follow
858 ((eq (car regexp) ?\\)
859 (pop regexp)
860 (unless (eq (car regexp) ?})
861 (error "Syntax error in regexp: expected \"}\""))
862 (pop regexp)
863 (unless (car token)
864 (error "Syntax error in regexp: malformed \\{...\\}"))
865 (setcdr token (car token)))
866 ;; if next character is ",", we expect a second number to
867 ;; follow
868 ((eq (car regexp) ?,)
869 (pop regexp)
870 (while (if (null regexp)
871 (error "Syntax error in regexp:\
872 malformed \\{...\\}")
873 (not (eq (car regexp) ?\\)))
874 (setcdr token
875 (concat (cdr token)
876 (char-to-string (pop regexp)))))
877 (unless (null (cdr token))
878 (unless (string-match "[0-9]+" (cdr token))
879 (error "Syntax error in regexp: malformed \\{...\\}"))
880 (setcdr token (string-to-number (cdr token))))
881 (pop regexp)
882 (unless (eq (car regexp) ?})
883 (error "Syntax error in regexp: expected \"}\""))
884 (pop regexp))))
885 ))
886 )
887
888 ;; return first token type, token, and remaining regexp
889 (list type token regexp))))
890
891
892
893 ;;; ================================================================
894 ;;; tNFA evolution
895
896 (defun tNFA-next-state (tNFA chr pos)
897 "Evolve tNFA according to CHR, which corresponds to position
898 POS in a string."
899 (let (elem state)
900 ;; if there is a transition for character CHR...
901 (cond
902 ((setq elem (tNFA--assoc chr (tNFA--DFA-state-transitions tNFA)
903 :test (tNFA--DFA-state-test tNFA)))
904 ;; if next state has not already been computed, do so
905 (unless (tNFA--DFA-state-p (setq state (cdr elem)))
906 (setq state (tNFA--DFA-next-state tNFA chr pos nil))
907 (setcdr elem state)))
908
909 ;; if there's a wildcard transition...
910 ((setq state (tNFA--DFA-state-wildcard tNFA))
911 ;; if next state has not already been computed, do so
912 (unless (tNFA--DFA-state-p state)
913 (setq state (tNFA--DFA-next-state tNFA chr pos t))
914 (setf (tNFA--DFA-state-wildcard tNFA) state))))
915 state))
916
917
918
919 (defun tNFA--DFA-next-state (DFA-state chr pos wildcard)
920 (let (state-list state)
921 ;; add all states reached by a CHR transition from DFA-STATE to
922 ;; state list
923 (if wildcard
924 (dolist (state (tNFA--DFA-state-list DFA-state))
925 (when (or (eq (tNFA--state-type state) 'wildcard)
926 (and (eq (tNFA--state-type state) 'neg-char-alt)
927 (not (memq chr (tNFA--state-label state)))))
928 (push (tNFA--state-create
929 (tNFA--state-next state)
930 (tNFA--tags-copy (tNFA--state-tags state)))
931 state-list)))
932 (dolist (state (tNFA--DFA-state-list DFA-state))
933 (when (or (and (eq (tNFA--state-type state) 'literal)
934 (eq chr (tNFA--state-label state)))
935 (and (eq (tNFA--state-type state) 'char-alt)
936 (memq chr (tNFA--state-label state)))
937 (and (eq (tNFA--state-type state) 'neg-char-alt)
938 (not (memq chr (tNFA--state-label state))))
939 (eq (tNFA--state-type state) 'wildcard))
940 (push (tNFA--state-create
941 (tNFA--state-next state)
942 (tNFA--tags-copy (tNFA--state-tags state)))
943 state-list))))
944
945 ;; if state list is empty, return empty, failure DFA state
946 (when state-list
947 ;; otherwise, construct new DFA state and add it to the pool if
948 ;; it's not already there
949 (setq state-list (tNFA--epsilon-boundary state-list (1+ pos)))
950 (setq state
951 (or (gethash state-list (tNFA--DFA-state-pool DFA-state))
952 (tNFA--DFA-state-create
953 state-list
954 (tNFA--DFA-state-pool DFA-state)
955 :test (tNFA--DFA-state-test DFA-state))))
956 ;; return next state
957 state)))
958
959
960
961 (defun tNFA--epsilon-boundary (state-set pos)
962 ;; Return the tagged epsilon-boundary of the NFA states listed in
963 ;; STATE-SET, that is the set of all states that can be reached via
964 ;; epsilon transitions from some state in STATE-SET (not including
965 ;; states in STATE-SET itself).
966 (let ((queue (queue-create))
967 (result '())
968 (reset '())
969 state next tags)
970 ;; temporarily link the NFA states to their corresponding tNFA
971 ;; states, and add them to the queue
972 (dolist (t-state state-set)
973 (setf state (tNFA--state-NFA-state t-state)
974 (tNFA--NFA-state-tNFA-state state) t-state)
975 (push state reset)
976 (queue-enqueue queue state))
977
978 (while (setq state (queue-dequeue queue))
979 (cond
980 ;; branch or epsilon: add next states as necessary, copying tags
981 ;; across
982 ((or (eq (tNFA--NFA-state-type state) 'branch)
983 (eq (tNFA--NFA-state-type state) 'epsilon))
984 (dolist (next (if (eq (tNFA--NFA-state-type state) 'epsilon)
985 (list (tNFA--NFA-state-next state))
986 (tNFA--NFA-state-next state)))
987 (unless (tNFA--NFA-state-tNFA-state next)
988 (setf (tNFA--NFA-state-tNFA-state next)
989 (tNFA--state-create
990 next (tNFA--tags-copy (tNFA--NFA-state-tags state))))
991 (push next reset)
992 ;; if next state hasn't already been seen in-degree times,
993 ;; add it to the end of the queue
994 (if (/= (decf (tNFA--NFA-state-count next)) 0)
995 (queue-enqueue queue next)
996 ;; if it has now been seen in-degree times, reset count
997 ;; and add it back to the front of the queue
998 (setf (tNFA--NFA-state-count next)
999 (tNFA--NFA-state-in-degree next))
1000 (queue-prepend queue next)))))
1001
1002 ;; tag: add next state if necessary, updating tags if necessary
1003 ((eq (tNFA--NFA-state-type state) 'tag)
1004 (setq next (tNFA--NFA-state-next state))
1005 ;; if next state is not already in results list, or it is
1006 ;; already in results but new tag value takes precedence...
1007 (when (or (not (tNFA--NFA-state-tNFA-state next))
1008 (tNFA--tags< pos (tNFA--NFA-state-tag state)
1009 (tNFA--NFA-state-tags next)))
1010 ;; if next state is already in results, update tag value
1011 (if (tNFA--NFA-state-tNFA-state next)
1012 (tNFA--tags-set (tNFA--NFA-state-tags next)
1013 (tNFA--NFA-state-tag state) pos)
1014 ;; if state is not already in results, copy tags, updating
1015 ;; tag value, and add next state to results list
1016 (setq tags (tNFA--tags-copy (tNFA--NFA-state-tags state)))
1017 (tNFA--tags-set tags (tNFA--NFA-state-tag state) pos)
1018 (setf (tNFA--NFA-state-tNFA-state next)
1019 (tNFA--state-create next tags))
1020 (push next reset))
1021 ;; if next state hasn't already been seen in-degree times, add
1022 ;; it to the end of the queue
1023 (if (/= (decf (tNFA--NFA-state-count next)) 0)
1024 (queue-enqueue queue next)
1025 ;; if it has now been seen in-degree times, reset count and
1026 ;; add it back to the front of the queue
1027 (setf (tNFA--NFA-state-count next)
1028 (tNFA--NFA-state-in-degree next))
1029 (queue-prepend queue next))))
1030
1031 ;; anything else is a non-epsilon-transition state, so add it to
1032 ;; result
1033 (t (push (tNFA--NFA-state-tNFA-state state) result))
1034 ))
1035
1036 ;; reset temporary NFA state link and count
1037 (dolist (state reset)
1038 (setf (tNFA--NFA-state-tNFA-state state) nil
1039 (tNFA--NFA-state-count state)
1040 (tNFA--NFA-state-in-degree state)))
1041 ;; sort result states
1042 (sort result
1043 (lambda (a b) (< (tNFA--state-id a) (tNFA--state-id b))))
1044 ))
1045
1046
1047
1048 ;;; ================================================================
1049 ;;; tNFA matching
1050
1051 ;;;###autoload
1052 (defun* tNFA-regexp-match (regexp string &key (test 'eq))
1053 "Return non-nil if STRING matches REGEXP, nil otherwise.
1054 Sets the match data if there was a match; see `match-beginning',
1055 `match-end' and `match-string'.
1056
1057 REGEXP and STRING can be any sequence type (vector, list, or
1058 string); they need not be actual strings. Special characters in
1059 REGEXP are still just that: elements of the sequence that are
1060 characters which have a special meaning in regexps.
1061
1062 The :test keyword argument specifies how to test whether two
1063 individual elements of STRING are identical. The default is `eq'.
1064
1065 Only a subset of the full Emacs regular expression syntax is
1066 supported. There is no support for regexp constructs that are
1067 only meaningful for strings (character ranges and character
1068 classes inside character alternatives, and syntax-related
1069 backslash constructs). Back-references and non-greedy postfix
1070 operators are not supported, so `?' after a postfix operator
1071 loses its special meaning. Also, matches are always anchored, so
1072 `$' and `^' lose their special meanings (use `.*' at the
1073 beginning and end of the regexp to get an unanchored match)."
1074
1075 (let ((tNFA (tNFA-from-regexp regexp :test test))
1076 (i -1) tags match-data group-stack (grp 0))
1077
1078 ;; evolve tNFA according to characters of STRING
1079 (catch 'fail
1080 (dolist (chr (append string nil))
1081 (unless (setq tNFA (tNFA-next-state tNFA chr (incf i)))
1082 (throw 'fail nil)))
1083
1084 ;; if REGEXP matched...
1085 (when (setq tags (tNFA--DFA-state-match tNFA))
1086 (setq match-data (make-list (+ (length tags) 2) nil))
1087 ;; set match data
1088 (setf (nth 0 match-data) 0
1089 (nth 1 match-data) (length string))
1090 ;; set group match data if there were any groups
1091 (dotimes (i (length tags))
1092 (if (eq (tNFA--tags-type tags i) 'max)
1093 (unless (= (tNFA--tags-get tags i) -1)
1094 (setf (nth (1+ (* 2 (pop group-stack))) match-data)
1095 (tNFA--tags-get tags i)))
1096 (incf grp)
1097 (unless (= (tNFA--tags-get tags i) -1)
1098 (push grp group-stack)
1099 (setf (nth (* 2 grp) match-data)
1100 (tNFA--tags-get tags i)))))
1101 (set-match-data match-data)
1102 tags))))
1103
1104
1105 (defun tNFA-group-data (tNFA)
1106 "Return the group match data associated with a tNFA state."
1107 (tNFA--tags-to-groups (tNFA--DFA-state-match tNFA)))
1108
1109
1110
1111 (provide 'tNFA)
1112
1113 ;;; tNFA.el ends here