;;; lex-parse-re.el --- Parse Emacs regexps using Lex ;; Copyright (C) 2008,2013 Free Software Foundation, Inc. ;; Author: Stefan Monnier ;; Keywords: ;; This program is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; This program is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with this program. If not, see . ;;; Commentary: ;; This exports lex-parse-re, but it also defines lex--parse-charset which is ;; used internally by lex-compile to handle charsets specified as a string. ;;; Code: (require 'lex) ;;; Regexp parsers. (defun lex--tokenizer (lex string) (let ((tokens ()) (i 0) tmp) (while (and (< i (length string)) (setq tmp (lex-match-string lex string i))) (push (cons (car tmp) (substring string i (setq i (cadr tmp)))) tokens)) (nreverse tokens))) (defun lex--parse-charset (string) (let ((i 0) (ranges ())) (when (eq (aref string i) ?^) (push 'not ranges) (setq i (1+ i))) (let ((op nil) (case-fold-search nil)) (while (not (eq op 'stop)) (lex-case string i ((seq "[:" (0+ (char (?a . ?z) (?A . ?Z))) ":]") (push (intern (substring string (+ 2 (match-beginning 0)) (- (match-end 0) 2))) ranges)) ((seq anything "-" anything) (push (cons (aref string (match-beginning 0)) (aref string (1- (match-end 0)))) ranges)) (anything (push (aref string (1- (match-end 0))) ranges)) (eob (setq op 'stop)))) `(char ,@(nreverse ranges))))) (defconst lex--parse-re-lexspec '(((or "*" "+" "?" "*?" "+?" "??") . suffix) ((seq "[" (opt "^") (opt "]") (0+ (or (seq (char not ?\]) "-" (char not ?\])) (seq "[:" (1+ (char (?a . ?z) (?A . ?Z))) ":]") (char not ?\]))) "]") . charset) ((seq "\\c" anything) . category) ((seq "\\C" anything) . not-category) ((seq "\\s" anything) . syntax) ((seq "\\S" anything) . not-syntax) ((seq "\\" (char (?1 . ?9))) . backref) ("\\'" . eob) ("\\`" . bob) ("." . dot) ("^" . bol) ("$" . eol) ("." . dot) ("\\<" . bow) ("\\>" . eow) ("\\_<" . symbol-start) ("\\_>" . symbol-end) ("\\w" . wordchar) ("\\W" . not-wordchar) ("\\b" . word-boundary) ("\\B" . not-word-boundary) ("\\=" . point) ((or (seq ?\\ anything) anything) . char))) (defconst lex--parse-ere-lexer (let ((case-fold-search nil)) (lex-compile (append '(("(?:" . shy-group) ("|" . or) ((seq "{" (0+ (char (?0 . ?9))) (opt (seq "," (0+ (char (?0 . ?9))))) "}") . repeat) ((or ")" eob) . stop) ("(" . group)) lex--parse-re-lexspec)))) (defconst lex--parse-bre-lexer (let ((case-fold-search nil)) (lex-compile (append '(("\\(?:" . shy-group) ("\\|" . or) ((seq "\\{" (0+ (char (?0 . ?9))) (opt (seq "," (0+ (char (?0 . ?9))))) "\\}") . repeat) ((or "\\)" eob) . stop) ("\\(" . group)) lex--parse-re-lexspec)))) (defun lex--parse-re (string i lexer) (let ((stack ()) (op nil) (res nil) tmp) (while (and (not (eq op 'stop)) (setq tmp (lex-match-string lexer string i))) (pcase (car tmp) (`shy-group (setq tmp (lex--parse-re string (cadr tmp) lexer)) (unless (eq (aref string (1- (cadr tmp))) ?\)) (error "Unclosed shy-group")) (push (car tmp) res)) (`group (setq tmp (lex--parse-re string (cadr tmp) lexer)) (unless (eq (aref string (1- (cadr tmp))) ?\)) (error "Unclosed group")) (push (list 'group (car tmp)) res)) (`suffix (if (null res) (error "Non-prefixed suffix operator") (setq res (cons (list (cdr (assoc (substring string i (cadr tmp)) '(("*" . 0+) ("+" . 1+) ("?" . opt) ("*?" . *\?) ("+?" . +\?) ("??" . \?\?)))) (car res)) (cdr res))))) (`or (push `(or (seq ,@(nreverse res))) stack) (setq res nil)) (`charset (push (lex--parse-charset (substring string (1+ i) (1- (cadr tmp)))) res)) (`repeat ;; Here we would like to have sub-matches :-( (let* ((min (string-to-number (substring string (+ i (if (eq (aref string i) ?\\) 2 1)) (cadr tmp)))) (max (let ((comma (string-match "," string i))) (if (not (and comma (< comma (cadr tmp)))) min (if (= comma (- (cadr tmp) 2)) nil (string-to-number (substring string (1+ comma)))))))) (if (null res) (error "Non-prefixed repeat operator") (setq res (cons `(repeat ,min ,max ,(car res)) (cdr res)))))) (`stop (setq op 'stop)) ((or `syntax `category `not-syntax `not-category) (push (list (car tmp) (aref string (1- (cadr tmp)))) res)) (`backref (push (list (car tmp) (- (aref string (1- (cadr tmp))) ?0)) res)) (`char (push (aref string (1- (cadr tmp))) res)) (_ (push (car tmp) res))) (setq i (cadr tmp))) (let ((re `(seq ,@(nreverse res)))) (while stack (setq re (nconc (pop stack) (list re)))) (list re i)))) ;;;###autoload (defun lex-parse-re (string &optional lexer) "Parse STRING as a regular expression. LEXER specifies the regexp syntax to use. It can be `ere', or `bre' and it defaults to `bre'." (setq lexer (cond ((eq lexer 'ere) lex--parse-ere-lexer) ((memq lexer '(bre re nil)) lex--parse-bre-lexer) (t lexer))) (let ((res (lex--parse-re string 0 lexer))) (if (< (cadr res) (length string)) (error "Regexp parsing failed around %d: ...%s..." (cadr res) (substring string (1- (cadr res)) (1+ (cadr res)))) (car res)))) ;; (defun lex--parse-re (string i) ;; (let ((stack ()) ;; (op nil) ;; (res nil)) ;; (while (and (not (eq op 'stop))) ;; (lex-case string i ;; ("(?:" ;shy-group. ;; (let ((tmp (lex--parse-re string i))) ;; (setq i (car tmp)) ;; (unless (eq (aref string (1- i)) ?\)) (error "Unclosed shy-group")) ;; (push (cdr tmp) res))) ;; ((or "*?" "+?" "??") ;; (error "Greediness control unsupported `%s'" (match-string 0 string))) ;; ((or "*" "+" "?") ;; (if (null res) (error "Non-prefixed suffix operator") ;; (setq res (cons (list (cdr (assq (aref string (1- i)) ;; '((?* . 0+) ;; (?+ . 1+) ;; (?? . opt)))) ;; (car res)) ;; (cdr res))))) ;; ("|" (push `(or (seq ,@(nreverse res))) stack) ;; (setq res nil)) ;; ((seq "[" (opt "^") (opt "]") ;; (0+ (or (seq (char not ?\]) "-" (char not ?\])) ;; (seq "[:" (1+ (char (?a . ?z) (?A . ?Z))) ":]") ;; (char not ?\]))) "]") ;; (push (lex--parse-charset ;; (substring string (1+ (match-beginning 0)) ;; (1- (match-end 0)))) ;; res)) ;; ((seq "{" (0+ (char (?0 . ?9))) ;; (opt (seq "," (0+ (char (?0 . ?9))))) "}") ;; ;; Here we would like to have sub-matches :-( ;; (let* ((min (string-to-number (substring string ;; (1+ (match-beginning 0)) ;; (match-end 0)))) ;; (max (let ((comma (string-match "," string (match-beginning 0)))) ;; (if (not (and comma (< comma (match-end 0)))) ;; min ;; (if (= comma (- (match-end 0) 2)) ;; nil ;; (string-to-number (substring string (1+ comma)))))))) ;; (if (null res) (error "Non-prefixed repeat operator") ;; (setq res (cons `(repeat ,min ,max ,(car res)) (cdr res)))))) ;; ((or ")" eob) (setq op 'stop)) ;; ("\\'" (push 'eob res)) ;; ("\\`" (push 'bob res)) ;; ("^" (push 'bol res)) ;; ("$" (push 'eol res)) ;; ("." (push 'dot res)) ;; ((or "(" "\\<" "\\>" "\\_<" "\\_>" "\\c" "\\s" "\\C" "\\S" "\\w" "\\W" ;; "\\b" "\\B" "\\=" (seq "\\" (char (?1 . ?9)))) ;; (error "Unsupported construct `%s'" (match-string 0 string))) ;; ((or (seq ?\\ anything) anything) ;; (push (aref string (1- (match-end 0))) res)) ;; ("" (error "This should not be reachable")))) ;; (let ((re `(seq ,@(nreverse res)))) ;; (while stack (setq re (nconc (pop stack) (list re)))) ;; (cons i re)))) (provide 'lex-parse-re) ;;; lex-parse-re.el ends here