; Merge from origin/emacs-25

[gnu-emacs] / lisp / emacs-lisp / rx.el
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el

index 7122de4789cc64b4b8dfbbc082d17a7361649d1e..66d295e221f77deab3b08a47b7fcbe564fa122bf 100644 (file)
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -1,9 +1,9 @@
  ;;; rx.el --- sexp notation for regular expressions
  
-;; Copyright (C) 2001-2011 Free Software Foundation, Inc.
+;; Copyright (C) 2001-2016 Free Software Foundation, Inc.
  
  ;; Author: Gerd Moellmann <gerd@gnu.org>
-;; Maintainer: FSF
+;; Maintainer: emacs-devel@gnu.org
  ;; Keywords: strings, regexps, extensions
  
  ;; This file is part of GNU Emacs.
@@ -35,9 +35,8 @@
  ;; that the `repeat' form can't have multiple regexp args.
  
  ;; Now alternative forms are provided for a degree of compatibility
-;; with Shivers' attempted definitive SRE notation
-;; <URL:http://www.ai.mit.edu/~/shivers/sre.txt>.  SRE forms not
-;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
+;; with Olin Shivers' attempted definitive SRE notation.  SRE forms
+;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
  ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
  ;; Some forms are inconsistent with SRE, either for historical reasons
  ;; or because of the implementation -- simple translation into Emacs
@@ -108,7 +107,9 @@
  
  ;;; Code:
  
-(defconst rx-constituents
+;; FIXME: support macros.
+
+(defvar rx-constituents              ;Not `const' because some modes extend it.
    '((and               . (rx-and 1 nil))
      (seq               . and)          ; SRE
      (:                 . and)          ; SRE
@@ -130,6 +131,8 @@
      (**                        . (rx-** 2 nil))   ; SRE
      (submatch          . (rx-submatch 1 nil)) ; SRE
      (group             . submatch)     ; sregex
+    (submatch-n                . (rx-submatch-n 2 nil))
+    (group-n           . submatch-n)
      (zero-or-more      . (rx-kleene 1 nil))
      (one-or-more       . (rx-kleene 1 nil))
      (zero-or-one       . (rx-kleene 1 nil))
@@ -255,7 +258,8 @@ regular expressions.")
      (not-at-end-of-line                . ?<)
      (not-at-beginning-of-line  . ?>)
      (alpha-numeric-two-byte    . ?A)
-    (chinse-two-byte           . ?C)
+    (chinese-two-byte          . ?C)
+    (chinse-two-byte           . ?C) ;; A typo in Emacs 21.1-24.3.
      (greek-two-byte            . ?G)
      (japanese-hiragana-two-byte . ?H)
      (indian-two-byte           . ?I)
@@ -391,7 +395,7 @@ FORM is of the form `(and FORM1 ...)'."
  (defun rx-anything (form)
    "Match any character."
    (if (consp form)
-      (error "rx `anythng' syntax error: %s" form))
+      (error "rx `anything' syntax error: %s" form))
    (rx-or (list 'or 'not-newline ?\n)))
  
  
@@ -690,6 +694,16 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
              (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
            "\\)"))
  
+(defun rx-submatch-n (form)
+  "Parse and produce code from FORM, which is `(submatch-n N ...)'."
+  (let ((n (nth 1 form)))
+    (unless (and (integerp n) (> n 0)) ; Any other N gives an invalid regexp.
+      (error "rx `%s' requires a positive integer N: %S" (car form) form))
+    (concat "\\(?" (number-to-string n) ":"
+            (if (= 3 (length form))    ; One sub-form, else concatenate all.
+                (rx-form (nth 2 form))
+              (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
+            "\\)")))
  
  (defun rx-backref (form)
    "Parse and produce code from FORM, which is `(backref N)'."
@@ -754,8 +768,8 @@ of all atomic regexps."
       ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
       ((null lax)
        (cond
-       ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r))
-       ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r)))))))
+       ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*\\]\\'" r))
+       ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))
  
  
  (defun rx-syntax (form)
@@ -801,9 +815,9 @@ of all atomic regexps."
  
  (defun rx-greedy (form)
    "Parse and produce code from FORM.
-If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
+If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
  `+', and `?' operators will be used in FORM1.  If FORM is
-'(maximal-match FORM1)', greedy operators will be used."
+`(maximal-match FORM1)', greedy operators will be used."
    (rx-check form)
    (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
      (rx-form (cadr form) rx-parent)))
@@ -820,27 +834,28 @@ If FORM is '(minimal-match FORM1)', non-greedy versions of `*',
  FORM is a regular expression in sexp form.
  RX-PARENT shows which type of expression calls and controls putting of
  shy groups around the result and some more in other functions."
-  (if (stringp form)
-      (rx-group-if (regexp-quote form)
-                  (if (and (eq rx-parent '*) (< 1 (length form)))
-                      rx-parent))
-    (cond ((integerp form)
-          (regexp-quote (char-to-string form)))
-         ((symbolp form)
-          (let ((info (rx-info form nil)))
-            (cond ((stringp info)
-                   info)
-                  ((null info)
-                   (error "Unknown rx form `%s'" form))
-                  (t
-                   (funcall (nth 0 info) form)))))
-         ((consp form)
-          (let ((info (rx-info (car form) 'head)))
-            (unless (consp info)
-              (error "Unknown rx form `%s'" (car form)))
-            (funcall (nth 0 info) form)))
-         (t
-          (error "rx syntax error at `%s'" form)))))
+  (cond
+   ((stringp form)
+    (rx-group-if (regexp-quote form)
+                 (if (and (eq rx-parent '*) (< 1 (length form)))
+                     rx-parent)))
+   ((integerp form)
+    (regexp-quote (char-to-string form)))
+   ((symbolp form)
+    (let ((info (rx-info form nil)))
+      (cond ((stringp info)
+             info)
+            ((null info)
+             (error "Unknown rx form `%s'" form))
+            (t
+             (funcall (nth 0 info) form)))))
+   ((consp form)
+    (let ((info (rx-info (car form) 'head)))
+      (unless (consp info)
+        (error "Unknown rx form `%s'" (car form)))
+      (funcall (nth 0 info) form)))
+   (t
+    (error "rx syntax error at `%s'" form))))
  
  
  ;;;###autoload
@@ -857,7 +872,7 @@ NO-GROUP non-nil means don't put shy groups around the result."
  REGEXPS is a non-empty sequence of forms of the sort listed below.
  
  Note that `rx' is a Lisp macro; when used in a Lisp program being
- compiled, the translation is performed by the compiler.
+compiled, the translation is performed by the compiler.
  See `rx-to-string' for how to do such a translation at run-time.
  
  The following are valid subforms of regular expressions in sexp
@@ -950,20 +965,20 @@ CHAR
       matches space and tab only.
  
  `graphic', `graph'
-     matches graphic characters--everything except ASCII control chars,
-     space, and DEL.
+     matches graphic characters--everything except whitespace, ASCII
+     and non-ASCII control characters, surrogates, and codepoints
+     unassigned by Unicode.
  
  `printing', `print'
-     matches printing characters--everything except ASCII control chars
-     and DEL.
+     matches whitespace and graphic characters.
  
  `alphanumeric', `alnum'
-     matches letters and digits.  (But at present, for multibyte characters,
-     it matches anything that has word syntax.)
+     matches alphabetic characters and digits.  (For multibyte characters,
+     it matches according to Unicode character properties.)
  
  `letter', `alphabetic', `alpha'
-     matches letters.  (But at present, for multibyte characters,
-     it matches anything that has word syntax.)
+     matches alphabetic characters.  (For multibyte characters,
+     it matches according to Unicode character properties.)
  
  `ascii'
       matches ASCII (unibyte) characters.
@@ -1031,7 +1046,7 @@ CHAR
       `not-at-end-of-line'              (\\c<)
       `not-at-beginning-of-line'                (\\c>)
       `alpha-numeric-two-byte'          (\\cA)
-     `chinse-two-byte'                 (\\cC)
+     `chinese-two-byte'                        (\\cC)
       `greek-two-byte'                  (\\cG)
       `japanese-hiragana-two-byte'      (\\cH)
       `indian-tow-byte'                 (\\cI)
@@ -1072,6 +1087,11 @@ CHAR
       like `and', but makes the match accessible with `match-end',
       `match-beginning', and `match-string'.
  
+`(submatch-n N SEXP1 SEXP2 ...)'
+`(group-n N SEXP1 SEXP2 ...)'
+     like `group', but makes it an explicitly-numbered group with
+     group number N.
+
  `(or SEXP1 SEXP2 ...)'
  `(| SEXP1 SEXP2 ...)'
       matches anything that matches SEXP1 or SEXP2, etc.  If all