X-Git-Url: https://code.delx.au/gnu-emacs/blobdiff_plain/4382021fc15f8a52c9dd12bacb7c75dbeb562302..af7c7572ce8d87f51817d0f518d1b0aced074a41:/lisp/emacs-lisp/rx.el diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index 6caa77220b..5d04494ecb 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -1,7 +1,7 @@ ;;; rx.el --- sexp notation for regular expressions ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, -;; 2006, 2007 Free Software Foundation, Inc. +;; 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. ;; Author: Gerd Moellmann ;; Maintainer: FSF @@ -9,10 +9,10 @@ ;; This file is part of GNU Emacs. -;; GNU Emacs is free software; you can redistribute it and/or modify +;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 2, or (at your option) -;; any later version. +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -20,9 +20,7 @@ ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License -;; along with GNU Emacs; see the file COPYING. If not, write to the -;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, -;; Boston, MA 02110-1301, USA. +;; along with GNU Emacs. If not, see . ;;; Commentary: @@ -120,7 +118,7 @@ (| . or) ; SRE (not-newline . ".") (nonl . not-newline) ; SRE - (anything . "\\(?:.\\|\n\\)") + (anything . (rx-anything 0 nil)) (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE (in . any) (char . any) ; sregex @@ -208,8 +206,7 @@ (upper-case . upper) ; SRE (word . "[[:word:]]") ; inconsistent with SRE (wordchar . word) ; sregex - (not-wordchar . "[^[:word:]]") ; sregex (use \\W?) - ) + (not-wordchar . "\\W")) "Alist of sexp form regexp constituents. Each element of the alist has the form (SYMBOL . DEFN). SYMBOL is a valid constituent of sexp regular expressions. @@ -334,82 +331,237 @@ See also `rx-constituents'." (car form) type-pred)))))) +(defun rx-group-if (regexp group) + "Put shy groups around REGEXP if seemingly necessary when GROUP +is non-nil." + (cond + ;; for some repetition + ((eq group '*) (if (rx-atomic-p regexp) (setq group nil))) + ;; for concatenation + ((eq group ':) + (if (rx-atomic-p + (if (string-match + "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp) + (substring regexp 0 (match-beginning 0)) + regexp)) + (setq group nil))) + ;; for OR + ((eq group '|) (setq group nil)) + ;; do anyway + ((eq group t)) + ((rx-atomic-p regexp t) (setq group nil))) + (if group + (concat "\\(?:" regexp "\\)") + regexp)) + + +(defvar rx-parent) +;; dynamically bound in some functions. + + (defun rx-and (form) "Parse and produce code from FORM. FORM is of the form `(and FORM1 ...)'." (rx-check form) - (concat "\\(?:" - (mapconcat - (function (lambda (x) (rx-to-string x 'no-group))) - (cdr form) nil) - "\\)")) + (rx-group-if + (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil) + (and (memq rx-parent '(* t)) rx-parent))) (defun rx-or (form) "Parse and produce code from FORM, which is `(or FORM1 ...)'." (rx-check form) - (let ((all-args-strings t)) - (dolist (arg (cdr form)) - (unless (stringp arg) - (setq all-args-strings nil))) - (concat "\\(?:" - (if all-args-strings - (regexp-opt (cdr form)) - (mapconcat #'rx-to-string (cdr form) "\\|")) - "\\)"))) + (rx-group-if + (if (memq nil (mapcar 'stringp (cdr form))) + (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|") + (regexp-opt (cdr form))) + (and (memq rx-parent '(: * t)) rx-parent))) + + +(defun rx-anything (form) + "Match any character." + (if (consp form) + (error "rx `anythng' syntax error: %s" form)) + (rx-or (list 'or 'not-newline ?\n))) + + +(defun rx-any-delete-from-range (char ranges) + "Delete by side effect character CHAR from RANGES. +Only both edges of each range is checked." + (let (m) + (cond + ((memq char ranges) (setq ranges (delq char ranges))) + ((setq m (assq char ranges)) + (if (eq (1+ char) (cdr m)) + (setcar (memq m ranges) (1+ char)) + (setcar m (1+ char)))) + ((setq m (rassq char ranges)) + (if (eq (1- char) (car m)) + (setcar (memq m ranges) (1- char)) + (setcdr m (1- char))))) + ranges)) + + +(defun rx-any-condense-range (args) + "Condense by side effect ARGS as range for Rx `any'." + (let (str + l) + ;; set STR list of all strings + ;; set L list of all ranges + (mapc (lambda (e) (cond ((stringp e) (push e str)) + ((numberp e) (push (cons e e) l)) + (t (push e l)))) + args) + ;; condense overlapped ranges in L + (let ((tail (setq l (sort l #'car-less-than-car))) + d) + (while (setq d (cdr tail)) + (if (>= (cdar tail) (1- (caar d))) + (progn + (setcdr (car tail) (max (cdar tail) (cdar d))) + (setcdr tail (cdr d))) + (setq tail d)))) + ;; Separate small ranges to single number, and delete dups. + (nconc + (apply #'nconc + (mapcar (lambda (e) + (cond + ((= (car e) (cdr e)) (list (car e))) + ;; ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e))) + ((list e)))) + l)) + (delete-dups str)))) + + +(defun rx-check-any-string (str) + "Check string argument STR for Rx `any'." + (let ((i 0) + c1 c2 l) + (if (= 0 (length str)) + (error "String arg for Rx `any' must not be empty")) + (while (string-match ".-." str i) + ;; string before range: convert it to characters + (if (< i (match-beginning 0)) + (setq l (nconc + l + (append (substring str i (match-beginning 0)) nil)))) + ;; range + (setq i (match-end 0) + c1 (aref str (match-beginning 0)) + c2 (aref str (1- i))) + (cond + ((< c1 c2) (setq l (nconc l (list (cons c1 c2))))) + ((= c1 c2) (setq l (nconc l (list c1)))))) + ;; rest? + (if (< i (length str)) + (setq l (nconc l (append (substring str i) nil)))) + l)) -(defvar rx-bracket) ; dynamically bound in `rx-any' - (defun rx-check-any (arg) "Check arg ARG for Rx `any'." - (if (integerp arg) - (setq arg (string arg))) - (when (stringp arg) - (if (zerop (length arg)) - (error "String arg for Rx `any' must not be empty")) - ;; Quote ^ at start; don't bother to check whether this is first arg. - (if (eq ?^ (aref arg 0)) - (setq arg (concat "\\" arg))) - ;; Remove ] and set flag for adding it to start of overall result. - (when (string-match "\\]" arg) - (setq arg (replace-regexp-in-string "\\]" "" arg) - rx-bracket "]"))) - (when (symbolp arg) + (cond + ((integerp arg) (list arg)) + ((symbolp arg) (let ((translation (condition-case nil - (rx-to-string arg 'no-group) + (rx-form arg) (error nil)))) - (unless translation (error "Invalid char class `%s' in Rx `any'" arg)) - (setq arg (substring translation 1 -1)))) ; strip outer brackets - ;; sregex compatibility - (when (and (integerp (car-safe arg)) - (integerp (cdr-safe arg))) - (setq arg (string (car arg) ?- (cdr arg)))) - (unless (stringp arg) - (error "rx `any' requires string, character, char pair or char class args")) - arg) + (if (or (null translation) + (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation))) + (error "Invalid char class `%s' in Rx `any'" arg)) + (list (substring translation 1 -1)))) ; strip outer brackets + ((and (integerp (car-safe arg)) (integerp (cdr-safe arg))) + (list arg)) + ((stringp arg) (rx-check-any-string arg)) + ((error + "rx `any' requires string, character, char pair or char class args")))) + (defun rx-any (form) "Parse and produce code from FORM, which is `(any ARG ...)'. ARG is optional." (rx-check form) - (let* ((rx-bracket nil) - (args (mapcar #'rx-check-any (cdr form)))) ; side-effects `rx-bracket' - ;; If there was a ?- in the form, move it to the front to avoid - ;; accidental range. - (if (member "-" args) - (setq args (cons "-" (delete "-" args)))) - (apply #'concat "[" rx-bracket (append args '("]"))))) + (let* ((args (rx-any-condense-range + (apply + #'nconc + (mapcar #'rx-check-any (cdr form))))) + m + s) + (cond + ;; single close bracket + ;; => "[]...-]" or "[]...--.]" + ((memq ?\] args) + ;; set ] at the beginning + (setq args (cons ?\] (delq ?\] args))) + ;; set - at the end + (if (or (memq ?- args) (assq ?- args)) + (setq args (nconc (rx-any-delete-from-range ?- args) + (list ?-))))) + ;; close bracket starts a range + ;; => "[]-....-]" or "[]-.--....]" + ((setq m (assq ?\] args)) + ;; bring it to the beginning + (setq args (cons m (delq m args))) + (cond ((memq ?- args) + ;; to the end + (setq args (nconc (delq ?- args) (list ?-)))) + ((setq m (assq ?- args)) + ;; next to the bracket's range, make the second range + (setcdr args (cons m (delq m args)))))) + ;; bracket in the end range + ;; => "[]...-]" + ((setq m (rassq ?\] args)) + ;; set ] at the beginning + (setq args (cons ?\] (rx-any-delete-from-range ?\] args))) + ;; set - at the end + (if (or (memq ?- args) (assq ?- args)) + (setq args (nconc (rx-any-delete-from-range ?- args) + (list ?-))))) + ;; {no close bracket appears} + ;; + ;; bring single bar to the beginning + ((memq ?- args) + (setq args (cons ?- (delq ?- args)))) + ;; bar start a range, bring it to the beginning + ((setq m (assq ?- args)) + (setq args (cons m (delq m args)))) + ;; + ;; hat at the beginning? + ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^)) + (setq args (if (cdr args) + `(,(cadr args) ,(car args) ,@(cddr args)) + (nconc (rx-any-delete-from-range ?^ args) + (list ?^)))))) + ;; some 1-char? + (if (and (null (cdr args)) (numberp (car args)) + (or (= 1 (length + (setq s (regexp-quote (string (car args)))))) + (and (equal (car args) ?^) ;; unnecessary predicate? + (null (eq rx-parent '!))))) + s + (concat "[" + (mapconcat + (lambda (e) (cond + ((numberp e) (string e)) + ((consp e) + (if (and (= (1+ (car e)) (cdr e)) + (null (memq (car e) '(?\] ?-)))) + (string (car e) (cdr e)) + (string (car e) ?- (cdr e)))) + (e))) + args + nil) + "]")))) (defun rx-check-not (arg) "Check arg ARG for Rx `not'." (unless (or (and (symbolp arg) - (string-match "\\`\\[\\[:[-a-z]:\\]\\]\\'" + (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" (condition-case nil - (rx-to-string arg 'no-group) + (rx-form arg) (error "")))) - (eq arg 'word-boundary) + (eq arg 'word-boundary) (and (consp arg) (memq (car arg) '(not any in syntax category)))) (error "rx `not' syntax error: %s" arg)) @@ -419,16 +571,22 @@ ARG is optional." (defun rx-not (form) "Parse and produce code from FORM. FORM is `(not ...)'." (rx-check form) - (let ((result (rx-to-string (cadr form) 'no-group)) + (let ((result (rx-form (cadr form) '!)) case-fold-search) (cond ((string-match "\\`\\[^" result) - (if (= (length result) 4) - (substring result 2 3) - (concat "[" (substring result 2)))) + (cond + ((equal result "[^]") "[^^]") + ((and (= (length result) 4) (null (eq rx-parent '!))) + (regexp-quote (substring result 2 3))) + ((concat "[" (substring result 2))))) ((eq ?\[ (aref result 0)) (concat "[^" (substring result 1))) - ((string-match "\\`\\\\[scb]" result) - (concat (capitalize (substring result 0 2)) (substring result 2))) + ((string-match "\\`\\\\[scbw]" result) + (concat (upcase (substring result 0 2)) + (substring result 2))) + ((string-match "\\`\\\\[SCBW]" result) + (concat (downcase (substring result 0 2)) + (substring result 2))) (t (concat "[^" result "]"))))) @@ -466,7 +624,7 @@ If SKIP is non-nil, allow that number of items after the head, i.e. (unless (and (integerp (nth 1 form)) (> (nth 1 form) 0)) (error "rx `=' requires positive integer first arg")) - (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form))) + (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) (defun rx->= (form) @@ -476,14 +634,14 @@ If SKIP is non-nil, allow that number of items after the head, i.e. (unless (and (integerp (nth 1 form)) (> (nth 1 form) 0)) (error "rx `>=' requires positive integer first arg")) - (format "%s\\{%d,\\}" (rx-to-string (nth 2 form)) (nth 1 form))) + (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form))) (defun rx-** (form) "Parse and produce code from FORM `(** N M ...)'." (rx-check form) (setq form (cons 'repeat (cdr (rx-trans-forms form 2)))) - (rx-to-string form)) + (rx-form form '*)) (defun rx-repeat (form) @@ -494,7 +652,7 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'." (unless (and (integerp (nth 1 form)) (> (nth 1 form) 0)) (error "rx `repeat' requires positive integer first arg")) - (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) (nth 1 form))) + (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form))) ((or (not (integerp (nth 2 form))) (< (nth 2 form) 0) (not (integerp (nth 1 form))) @@ -502,16 +660,20 @@ FORM is either `(repeat N FORM1)' or `(repeat N M FORM1)'." (< (nth 2 form) (nth 1 form))) (error "rx `repeat' range error")) (t - (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form)) + (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*) (nth 1 form) (nth 2 form))))) (defun rx-submatch (form) "Parse and produce code from FORM, which is `(submatch ...)'." (concat "\\(" - (mapconcat (function (lambda (x) (rx-to-string x 'no-group))) - (cdr form) nil) - "\\)")) + (if (= 2 (length form)) + ;; Only one sub-form. + (rx-form (cadr form)) + ;; Several sub-forms implicitly concatenated. + (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil)) + "\\)")) + (defun rx-backref (form) "Parse and produce code from FORM, which is `(backref N)'." @@ -533,19 +695,19 @@ If OP is anything else, produce a greedy regexp if `rx-greedy-flag' is non-nil." (rx-check form) (setq form (rx-trans-forms form)) - (let ((suffix (cond ((memq (car form) '(* + ? )) "") + (let ((suffix (cond ((memq (car form) '(* + ?\s)) "") ((memq (car form) '(*? +? ??)) "?") (rx-greedy-flag "") (t "?"))) (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*") ((memq (car form) '(+ +? 1+ one-or-more)) "+") - (t "?"))) - (result (rx-to-string (cadr form) 'no-group))) - (if (not (rx-atomic-p result)) - (setq result (concat "\\(?:" result "\\)"))) - (concat result op suffix))) + (t "?")))) + (rx-group-if + (concat (rx-form (cadr form) '*) op suffix) + (and (memq rx-parent '(t *)) rx-parent)))) + -(defun rx-atomic-p (r) +(defun rx-atomic-p (r &optional lax) "Return non-nil if regexp string R is atomic. An atomic regexp R is one such that a suffix operator appended to R will apply to all of R. For example, \"a\" @@ -554,7 +716,7 @@ appended to R will apply to all of R. For example, \"a\" This function may return false negatives, but it will not return false positives. It is nevertheless useful in -situations where an efficiency shortcut can be taken iff a +situations where an efficiency shortcut can be taken only if a regexp is atomic. The function can be improved to detect more cases of atomic regexps. Presently, this function detects the following categories of atomic regexp; @@ -570,13 +732,14 @@ be detected without much effort. A guarantee of no false negatives would require a theoretic specification of the set of all atomic regexps." (let ((l (length r))) - (or (equal l 1) - (and (>= l 6) - (equal (substring r 0 2) "\\(") - (equal (substring r -2) "\\)")) - (and (>= l 2) - (equal (substring r 0 1) "[") - (equal (substring r -1) "]"))))) + (cond + ((<= l 1)) + ((= l 2) (= (aref r 0) ?\\)) + ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r)) + ((null lax) + (cond + ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^\]]\\)*\\]\\'" r)) + ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^\)]\\)*\\\\)\\'" r))))))) (defun rx-syntax (form) @@ -614,7 +777,7 @@ of all atomic regexps." (defun rx-eval (form) "Parse and produce code from FORM, which is `(eval FORM)'." (rx-check form) - (rx-to-string (eval (cadr form)))) + (rx-form (eval (cadr form)) rx-parent)) (defun rx-greedy (form) @@ -624,13 +787,41 @@ If FORM is '(minimal-match FORM1)', non-greedy versions of `*', '(maximal-match FORM1)', greedy operators will be used." (rx-check form) (let ((rx-greedy-flag (eq (car form) 'maximal-match))) - (rx-to-string (cadr form)))) + (rx-form (cadr form) rx-parent))) (defun rx-regexp (form) "Parse and produce code from FORM, which is `(regexp STRING)'." (rx-check form) - (concat "\\(?:" (cadr form) "\\)")) + (rx-group-if (cadr form) rx-parent)) + + +(defun rx-form (form &optional rx-parent) + "Parse and produce code for regular expression FORM. +FORM is a regular expression in sexp form. +RX-PARENT shows which type of expression calls and controls putting of +shy groups around the result and some more in other functions." + (if (stringp form) + (rx-group-if (regexp-quote form) + (if (and (eq rx-parent '*) (< 1 (length form))) + rx-parent)) + (cond ((integerp form) + (regexp-quote (char-to-string form))) + ((symbolp form) + (let ((info (rx-info form))) + (cond ((stringp info) + info) + ((null info) + (error "Unknown rx form `%s'" form)) + (t + (funcall (nth 0 info) form))))) + ((consp form) + (let ((info (rx-info (car form)))) + (unless (consp info) + (error "Unknown rx form `%s'" (car form))) + (funcall (nth 0 info) form))) + (t + (error "rx syntax error at `%s'" form))))) ;;;###autoload @@ -638,35 +829,17 @@ If FORM is '(minimal-match FORM1)', non-greedy versions of `*', "Parse and produce code for regular expression FORM. FORM is a regular expression in sexp form. NO-GROUP non-nil means don't put shy groups around the result." - (cond ((stringp form) - (regexp-quote form)) - ((integerp form) - (regexp-quote (char-to-string form))) - ((symbolp form) - (let ((info (rx-info form))) - (cond ((stringp info) - info) - ((null info) - (error "Unknown rx form `%s'" form)) - (t - (funcall (nth 0 info) form))))) - ((consp form) - (let ((info (rx-info (car form)))) - (unless (consp info) - (error "Unknown rx form `%s'" (car form))) - (let ((result (funcall (nth 0 info) form))) - (if (or no-group (string-match "\\`\\\\[(]" result)) - result - (concat "\\(?:" result "\\)"))))) - (t - (error "rx syntax error at `%s'" form)))) + (rx-group-if (rx-form form) (null no-group))) ;;;###autoload (defmacro rx (&rest regexps) "Translate regular expressions REGEXPS in sexp form to a regexp string. REGEXPS is a non-empty sequence of forms of the sort listed below. -See also `rx-to-string' for how to do such a translation at run-time. + +Note that `rx' is a Lisp macro; when used in a Lisp program being + compiled, the translation is performed by the compiler. +See `rx-to-string' for how to do such a translation at run-time. The following are valid subforms of regular expressions in sexp notation. @@ -679,7 +852,7 @@ CHAR `not-newline', `nonl' matches any character except a newline. - . + `anything' matches any character @@ -945,15 +1118,9 @@ enclosed in `(and ...)'. `(** N M SEXP ...)' matches N to M occurrences. -`(backref N)' - matches what was matched previously by submatch N. - `(backref N)' matches what was matched previously by submatch N. -`(backref N)' - matches what was matched previously by submatch N. - `(eval FORM)' evaluate FORM and insert result. If result is a string, `regexp-quote' it.