;;; rx.el --- sexp notation for regular expressions
-;; Copyright (C) 2001, 2002, 2003, 2004, 2005,
-;; 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
;; Author: Gerd Moellmann <gerd@gnu.org>
;; Maintainer: FSF
;; that the `repeat' form can't have multiple regexp args.
;; Now alternative forms are provided for a degree of compatibility
-;; with Shivers' attempted definitive SRE notation
-;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not
-;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
+;; with Olin Shivers' attempted definitive SRE notation. SRE forms
+;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
;; ,<exp>, (word ...), word+, posix-string, and character class forms.
;; Some forms are inconsistent with SRE, either for historical reasons
;; or because of the implementation -- simple translation into Emacs
;;; Code:
-(defconst rx-constituents
+;; FIXME: support macros.
+
+(defvar rx-constituents ;Not `const' because some modes extend it.
'((and . (rx-and 1 nil))
(seq . and) ; SRE
(: . and) ; SRE
(nonl . not-newline) ; SRE
(anything . (rx-anything 0 nil))
(any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
+ (any . ".") ; sregex
(in . any)
(char . any) ; sregex
(not-char . (rx-not-char 1 nil rx-check-any)) ; sregex
(not . (rx-not 1 1 rx-check-not))
- ;; Partially consistent with sregex, whose `repeat' is like our
- ;; `**'. (`repeat' with optional max arg and multiple sexp forms
- ;; is ambiguous.)
- (repeat . (rx-repeat 2 3))
+ (repeat . (rx-repeat 2 nil))
(= . (rx-= 2 nil)) ; SRE
(>= . (rx->= 2 nil)) ; SRE
(** . (rx-** 2 nil)) ; SRE
(submatch . (rx-submatch 1 nil)) ; SRE
- (group . submatch)
+ (group . submatch) ; sregex
+ (submatch-n . (rx-submatch-n 2 nil))
+ (group-n . submatch-n)
(zero-or-more . (rx-kleene 1 nil))
(one-or-more . (rx-kleene 1 nil))
(zero-or-one . (rx-kleene 1 nil))
(category . (rx-category 1 1 rx-check-category))
(eval . (rx-eval 1 1))
(regexp . (rx-regexp 1 1 stringp))
+ (regex . regexp) ; sregex
(digit . "[[:digit:]]")
(numeric . digit) ; SRE
(num . digit) ; SRE
`zero-or-more', and `one-or-more'. Dynamically bound.")
-(defun rx-info (op)
+(defun rx-info (op head)
"Return parsing/code generation info for OP.
If OP is the space character ASCII 32, return info for the symbol `?'.
If OP is the character `?', return info for the symbol `??'.
-See also `rx-constituents'."
+See also `rx-constituents'.
+If HEAD is non-nil, then OP is the head of a sexp, otherwise it's
+a standalone symbol."
(cond ((eq op ? ) (setq op '\?))
((eq op ??) (setq op '\??)))
- (while (and (not (null op)) (symbolp op))
- (setq op (cdr (assq op rx-constituents))))
+ (let (old-op)
+ (while (and (not (null op)) (symbolp op))
+ (setq old-op op)
+ (setq op (cdr (assq op rx-constituents)))
+ (when (if head (stringp op) (consp op))
+ ;; We found something but of the wrong kind. Let's look for an
+ ;; alternate definition for the other case.
+ (let ((new-op
+ (cdr (assq old-op (cdr (memq (assq old-op rx-constituents)
+ rx-constituents))))))
+ (if (and new-op (not (if head (stringp new-op) (consp new-op))))
+ (setq op new-op))))))
op)
"Check FORM according to its car's parsing info."
(unless (listp form)
(error "rx `%s' needs argument(s)" form))
- (let* ((rx (rx-info (car form)))
+ (let* ((rx (rx-info (car form) 'head))
(nargs (1- (length form)))
(min-args (nth 1 rx))
(max-args (nth 2 rx))
(defun rx-anything (form)
"Match any character."
(if (consp form)
- (error "rx `anythng' syntax error: %s" form))
+ (error "rx `anything' syntax error: %s" form))
(rx-or (list 'or 'not-newline ?\n)))
(setcdr m (1- char)))))
ranges))
-
+
(defun rx-any-condense-range (args)
"Condense by side effect ARGS as range for Rx `any'."
(let (str
(mapcar (lambda (e)
(cond
((= (car e) (cdr e)) (list (car e)))
- ;; ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
+ ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
((list e))))
l))
(delete-dups str))))
((numberp e) (string e))
((consp e)
(if (and (= (1+ (car e)) (cdr e))
- (null (memq (car e) '(?\] ?-))))
+ ;; rx-any-condense-range should
+ ;; prevent this case from happening.
+ (null (memq (car e) '(?\] ?-)))
+ (null (memq (cdr e) '(?\] ?-))))
(string (car e) (cdr e))
(string (car e) ?- (cdr e))))
(e)))
(condition-case nil
(rx-form arg)
(error ""))))
- (eq arg 'word-boundary)
+ (eq arg 'word-boundary)
(and (consp arg)
(memq (car arg) '(not any in syntax category))))
(error "rx `not' syntax error: %s" arg))
(defun rx-** (form)
"Parse and produce code from FORM `(** N M ...)'."
(rx-check form)
- (setq form (cons 'repeat (cdr (rx-trans-forms form 2))))
- (rx-form form '*))
+ (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*))
(defun rx-repeat (form)
"Parse and produce code from FORM.
-FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'."
+FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
(rx-check form)
+ (if (> (length form) 4)
+ (setq form (rx-trans-forms form 2)))
+ (if (null (nth 2 form))
+ (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form)))))
(cond ((= (length form) 3)
(unless (and (integerp (nth 1 form))
(> (nth 1 form) 0))
(mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
"\\)"))
+(defun rx-submatch-n (form)
+ "Parse and produce code from FORM, which is `(submatch-n N ...)'."
+ (let ((n (nth 1 form)))
+ (concat "\\(?" (number-to-string n) ":"
+ (if (= 3 (length form))
+ ;; Only one sub-form.
+ (rx-form (nth 2 form))
+ ;; Several sub-forms implicitly concatenated.
+ (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
+ "\\)")))
(defun rx-backref (form)
"Parse and produce code from FORM, which is `(backref N)'."
"Parse and produce code from FORM, which is `(syntax SYMBOL)'."
(rx-check form)
(let* ((sym (cadr form))
- (syntax (assq sym rx-syntax)))
+ (syntax (cdr (assq sym rx-syntax))))
(unless syntax
;; Try sregex compatibility.
- (let ((name (symbol-name sym)))
- (if (= 1 (length name))
- (setq syntax (rassq (aref name 0) rx-syntax))))
+ (cond
+ ((characterp sym) (setq syntax sym))
+ ((symbolp sym)
+ (let ((name (symbol-name sym)))
+ (if (= 1 (length name))
+ (setq syntax (aref name 0))))))
(unless syntax
- (error "Unknown rx syntax `%s'" (cadr form))))
- (format "\\s%c" (cdr syntax))))
+ (error "Unknown rx syntax `%s'" sym)))
+ (format "\\s%c" syntax)))
(defun rx-check-category (form)
FORM is a regular expression in sexp form.
RX-PARENT shows which type of expression calls and controls putting of
shy groups around the result and some more in other functions."
- (if (stringp form)
- (rx-group-if (regexp-quote form)
- (if (and (eq rx-parent '*) (< 1 (length form)))
- rx-parent))
- (cond ((integerp form)
- (regexp-quote (char-to-string form)))
- ((symbolp form)
- (let ((info (rx-info form)))
- (cond ((stringp info)
- info)
- ((null info)
- (error "Unknown rx form `%s'" form))
- (t
- (funcall (nth 0 info) form)))))
- ((consp form)
- (let ((info (rx-info (car form))))
- (unless (consp info)
- (error "Unknown rx form `%s'" (car form)))
- (funcall (nth 0 info) form)))
- (t
- (error "rx syntax error at `%s'" form)))))
+ (cond
+ ((stringp form)
+ (rx-group-if (regexp-quote form)
+ (if (and (eq rx-parent '*) (< 1 (length form)))
+ rx-parent)))
+ ((integerp form)
+ (regexp-quote (char-to-string form)))
+ ((symbolp form)
+ (let ((info (rx-info form nil)))
+ (cond ((stringp info)
+ info)
+ ((null info)
+ (error "Unknown rx form `%s'" form))
+ (t
+ (funcall (nth 0 info) form)))))
+ ((consp form)
+ (let ((info (rx-info (car form) 'head)))
+ (unless (consp info)
+ (error "Unknown rx form `%s'" (car form)))
+ (funcall (nth 0 info) form)))
+ (t
+ (error "rx syntax error at `%s'" form))))
;;;###autoload
REGEXPS is a non-empty sequence of forms of the sort listed below.
Note that `rx' is a Lisp macro; when used in a Lisp program being
- compiled, the translation is performed by the compiler.
+compiled, the translation is performed by the compiler.
See `rx-to-string' for how to do such a translation at run-time.
The following are valid subforms of regular expressions in sexp
like `and', but makes the match accessible with `match-end',
`match-beginning', and `match-string'.
+`(submatch-n N SEXP1 SEXP2 ...)'
+`(group-n N SEXP1 SEXP2 ...)'
+ like `group', but make it an explicitly-numbered group with
+ group number N.
+
`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
matches anything that matches SEXP1 or SEXP2, etc. If all
\f
(provide 'rx)
-;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
;;; rx.el ends here