;;; rx.el --- sexp notation for regular expressions
-;; Copyright (C) 2001-2011 Free Software Foundation, Inc.
+;; Copyright (C) 2001-2016 Free Software Foundation, Inc.
;; Author: Gerd Moellmann <gerd@gnu.org>
-;; Maintainer: FSF
+;; Maintainer: emacs-devel@gnu.org
;; Keywords: strings, regexps, extensions
;; This file is part of GNU Emacs.
;; that the `repeat' form can't have multiple regexp args.
;; Now alternative forms are provided for a degree of compatibility
-;; with Shivers' attempted definitive SRE notation
-;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>. SRE forms not
-;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
+;; with Olin Shivers' attempted definitive SRE notation. SRE forms
+;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
;; ,<exp>, (word ...), word+, posix-string, and character class forms.
;; Some forms are inconsistent with SRE, either for historical reasons
;; or because of the implementation -- simple translation into Emacs
;;; Code:
-(defconst rx-constituents
+;; FIXME: support macros.
+
+(defvar rx-constituents ;Not `const' because some modes extend it.
'((and . (rx-and 1 nil))
(seq . and) ; SRE
(: . and) ; SRE
(** . (rx-** 2 nil)) ; SRE
(submatch . (rx-submatch 1 nil)) ; SRE
(group . submatch) ; sregex
+ (submatch-n . (rx-submatch-n 2 nil))
+ (group-n . submatch-n)
(zero-or-more . (rx-kleene 1 nil))
(one-or-more . (rx-kleene 1 nil))
(zero-or-one . (rx-kleene 1 nil))
(not-at-end-of-line . ?<)
(not-at-beginning-of-line . ?>)
(alpha-numeric-two-byte . ?A)
- (chinse-two-byte . ?C)
+ (chinese-two-byte . ?C)
+ (chinse-two-byte . ?C) ;; A typo in Emacs 21.1-24.3.
(greek-two-byte . ?G)
(japanese-hiragana-two-byte . ?H)
(indian-two-byte . ?I)
(defun rx-anything (form)
"Match any character."
(if (consp form)
- (error "rx `anythng' syntax error: %s" form))
+ (error "rx `anything' syntax error: %s" form))
(rx-or (list 'or 'not-newline ?\n)))
(mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
"\\)"))
+(defun rx-submatch-n (form)
+ "Parse and produce code from FORM, which is `(submatch-n N ...)'."
+ (let ((n (nth 1 form)))
+ (concat "\\(?" (number-to-string n) ":"
+ (if (= 3 (length form))
+ ;; Only one sub-form.
+ (rx-form (nth 2 form))
+ ;; Several sub-forms implicitly concatenated.
+ (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
+ "\\)")))
(defun rx-backref (form)
"Parse and produce code from FORM, which is `(backref N)'."
((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
((null lax)
(cond
- ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
- ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))
+ ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*\\]\\'" r))
+ ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))
(defun rx-syntax (form)
(defun rx-greedy (form)
"Parse and produce code from FORM.
-If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
+If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
`+', and `?' operators will be used in FORM1. If FORM is
-'(maximal-match FORM1)', greedy operators will be used."
+`(maximal-match FORM1)', greedy operators will be used."
(rx-check form)
(let ((rx-greedy-flag (eq (car form) 'maximal-match)))
(rx-form (cadr form) rx-parent)))
FORM is a regular expression in sexp form.
RX-PARENT shows which type of expression calls and controls putting of
shy groups around the result and some more in other functions."
- (if (stringp form)
- (rx-group-if (regexp-quote form)
- (if (and (eq rx-parent '*) (< 1 (length form)))
- rx-parent))
- (cond ((integerp form)
- (regexp-quote (char-to-string form)))
- ((symbolp form)
- (let ((info (rx-info form nil)))
- (cond ((stringp info)
- info)
- ((null info)
- (error "Unknown rx form `%s'" form))
- (t
- (funcall (nth 0 info) form)))))
- ((consp form)
- (let ((info (rx-info (car form) 'head)))
- (unless (consp info)
- (error "Unknown rx form `%s'" (car form)))
- (funcall (nth 0 info) form)))
- (t
- (error "rx syntax error at `%s'" form)))))
+ (cond
+ ((stringp form)
+ (rx-group-if (regexp-quote form)
+ (if (and (eq rx-parent '*) (< 1 (length form)))
+ rx-parent)))
+ ((integerp form)
+ (regexp-quote (char-to-string form)))
+ ((symbolp form)
+ (let ((info (rx-info form nil)))
+ (cond ((stringp info)
+ info)
+ ((null info)
+ (error "Unknown rx form `%s'" form))
+ (t
+ (funcall (nth 0 info) form)))))
+ ((consp form)
+ (let ((info (rx-info (car form) 'head)))
+ (unless (consp info)
+ (error "Unknown rx form `%s'" (car form)))
+ (funcall (nth 0 info) form)))
+ (t
+ (error "rx syntax error at `%s'" form))))
;;;###autoload
REGEXPS is a non-empty sequence of forms of the sort listed below.
Note that `rx' is a Lisp macro; when used in a Lisp program being
- compiled, the translation is performed by the compiler.
+compiled, the translation is performed by the compiler.
See `rx-to-string' for how to do such a translation at run-time.
The following are valid subforms of regular expressions in sexp
matches space and tab only.
`graphic', `graph'
- matches graphic characters--everything except ASCII control chars,
- space, and DEL.
+ matches graphic characters--everything except whitespace, ASCII
+ and non-ASCII control characters, surrogates, and codepoints
+ unassigned by Unicode.
`printing', `print'
- matches printing characters--everything except ASCII control chars
- and DEL.
+ matches whitespace and graphic characters.
`alphanumeric', `alnum'
- matches letters and digits. (But at present, for multibyte characters,
- it matches anything that has word syntax.)
+ matches alphabetic characters and digits. (For multibyte characters,
+ it matches according to Unicode character properties.)
`letter', `alphabetic', `alpha'
- matches letters. (But at present, for multibyte characters,
- it matches anything that has word syntax.)
+ matches alphabetic characters. (For multibyte characters,
+ it matches according to Unicode character properties.)
`ascii'
matches ASCII (unibyte) characters.
`not-at-end-of-line' (\\c<)
`not-at-beginning-of-line' (\\c>)
`alpha-numeric-two-byte' (\\cA)
- `chinse-two-byte' (\\cC)
+ `chinese-two-byte' (\\cC)
`greek-two-byte' (\\cG)
`japanese-hiragana-two-byte' (\\cH)
`indian-tow-byte' (\\cI)
like `and', but makes the match accessible with `match-end',
`match-beginning', and `match-string'.
+`(submatch-n N SEXP1 SEXP2 ...)'
+`(group-n N SEXP1 SEXP2 ...)'
+ like `group', but make it an explicitly-numbered group with
+ group number N.
+
`(or SEXP1 SEXP2 ...)'
`(| SEXP1 SEXP2 ...)'
matches anything that matches SEXP1 or SEXP2, etc. If all