;;; xml.el --- XML parser
-;; Copyright (C) 2000-2012 Free Software Foundation, Inc.
+;; Copyright (C) 2000-2013 Free Software Foundation, Inc.
;; Author: Emmanuel Briot <briot@gnat.com>
;; Maintainer: Mark A. Hershberger <mah@everybody.org>
;; a worthwhile tradeoff especially since we're usually parsing files
;; instead of hand-crafted XML.
-;;*******************************************************************
-;;**
-;;** Macros to parse the list
-;;**
-;;*******************************************************************
+;;; Macros to parse the list
(defconst xml-undefined-entity "?"
"What to substitute for undefined entities")
+(defconst xml-default-ns '(("" . "")
+ ("xml" . "http://www.w3.org/XML/1998/namespace")
+ ("xmlns" . "http://www.w3.org/2000/xmlns/"))
+ "Alist mapping default XML namespaces to their URIs.")
+
(defvar xml-entity-alist
- '(("lt" . "<")
+ '(("lt" . "<")
("gt" . ">")
("apos" . "'")
("quot" . "\"")
- ("amp" . "&"))
- "Alist of defined XML entities.")
+ ("amp" . "&"))
+ "Alist mapping XML entities to their replacement text.")
+
+(defvar xml-entity-expansion-limit 20000
+ "The maximum size of entity reference expansions.
+If the size of the buffer increases by this many characters while
+expanding entity references in a segment of character data, the
+XML parser signals an error. Setting this to nil removes the
+limit (making the parser vulnerable to XML bombs).")
(defvar xml-parameter-entity-alist nil
"Alist of defined XML parametric entities.")
would be represented by
- '(\"\" . \"foo\")."
+ '(\"\" . \"foo\").
+
+If you'd just like a plain symbol instead, use 'symbol-qnames in
+the PARSE-NS argument."
(car node))
See also `xml-get-attribute-or-nil'."
(or (xml-get-attribute-or-nil node attribute) ""))
-;;*******************************************************************
-;;**
-;;** Creating the list
-;;**
-;;*******************************************************************
-
-;;;###autoload
-(defun xml-parse-file (file &optional parse-dtd parse-ns)
- "Parse the well-formed XML file FILE.
-Return the top node with all its children.
-If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
-If PARSE-NS is non-nil, then QNAMES are expanded."
- (with-temp-buffer
- (insert-file-contents file)
- (xml--parse-buffer parse-dtd parse-ns)))
+;;; Regular expressions for XML components
+;; The following regexps are used as subexpressions in regexps that
+;; are `eval-when-compile'd for efficiency, so they must be defined at
+;; compile time.
(eval-and-compile
-(let* ((start-chars (concat "[:alpha:]:_"))
- (name-chars (concat "-[:digit:]." start-chars))
- ;;[3] S ::= (#x20 | #x9 | #xD | #xA)+
- (whitespace "[ \t\n\r]"))
- ;; [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6]
- ;; | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
- ;; | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
- ;; | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
- ;; | [#x10000-#xEFFFF]
- (defconst xml-name-start-char-re (concat "[" start-chars "]"))
- ;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
- ;; | [#x0300-#x036F] | [#x203F-#x2040]
- (defconst xml-name-char-re (concat "[" name-chars "]"))
- ;; [5] Name ::= NameStartChar (NameChar)*
- (defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
- ;; [6] Names ::= Name (#x20 Name)*
- (defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
- ;; [7] Nmtoken ::= (NameChar)+
- (defconst xml-nmtoken-re (concat xml-name-char-re "+"))
- ;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
- (defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
- ;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
- (defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
- ;; [68] EntityRef ::= '&' Name ';'
- (defconst xml-entity-ref (concat "&" xml-name-re ";"))
- ;; [69] PEReference ::= '%' Name ';'
- (defconst xml-pe-reference-re (concat "%" xml-name-re ";"))
- ;; [67] Reference ::= EntityRef | CharRef
- (defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
- ;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
- (defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|" xml-reference-re "\\)*\"\\|"
- "'\\(?:[^&']\\|" xml-reference-re "\\)*'\\)"))
- ;; [56] TokenizedType ::= 'ID' [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default]
- ;; | 'IDREF' [VC: IDREF]
- ;; | 'IDREFS' [VC: IDREF]
- ;; | 'ENTITY' [VC: Entity Name]
- ;; | 'ENTITIES' [VC: Entity Name]
- ;; | 'NMTOKEN' [VC: Name Token]
- ;; | 'NMTOKENS' [VC: Name Token]
- (defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|"
- "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)"))
- ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
- (defconst xml-notation-type-re
- (concat "\\(?:NOTATION" whitespace "(" whitespace "*" xml-name-re
- "\\(?:" whitespace "*|" whitespace "*" xml-name-re "\\)*"
- whitespace "*)\\)"))
- ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
- ;; [VC: Enumeration] [VC: No Duplicate Tokens]
- (defconst xml-enumeration-re (concat "\\(?:(" whitespace "*" xml-nmtoken-re
- "\\(?:" whitespace "*|" whitespace "*"
- xml-nmtoken-re "\\)*"
- whitespace ")\\)"))
- ;; [57] EnumeratedType ::= NotationType | Enumeration
- (defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re
- "\\|" xml-enumeration-re "\\)"))
- ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
- ;; [55] StringType ::= 'CDATA'
- (defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re
- "\\|" xml-notation-type-re
- "\\|" xml-enumerated-type-re "\\)"))
- ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
- (defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|\\(?:#FIXED"
- whitespace "\\)*" xml-att-value-re "\\)"))
- ;; [53] AttDef ::= S Name S AttType S DefaultDecl
- (defconst xml-att-def-re (concat "\\(?:" whitespace "*" xml-name-re
- whitespace "*" xml-att-type-re
- whitespace "*" xml-default-decl-re "\\)"))
- ;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
- ;; | "'" ([^%&'] | PEReference | Reference)* "'"
- (defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|" xml-pe-reference-re
- "\\|" xml-reference-re
- "\\)*\"\\|'\\(?:[^%&']\\|"
- xml-pe-reference-re "\\|"
- xml-reference-re "\\)*'\\)"))))
+
+;; [4] NameStartChar
+;; See the definition of word syntax in `xml-syntax-table'.
+(defconst xml-name-start-char-re (concat "[[:word:]:_]"))
+
+;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
+;; | [#x0300-#x036F] | [#x203F-#x2040]
+(defconst xml-name-char-re (concat "[-0-9.[:word:]:_·̀-ͯ‿-⁀]"))
+
+;; [5] Name ::= NameStartChar (NameChar)*
+(defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
+
+;; [6] Names ::= Name (#x20 Name)*
+(defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
+
+;; [7] Nmtoken ::= (NameChar)+
+(defconst xml-nmtoken-re (concat xml-name-char-re "+"))
+
+;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+(defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
+
+;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+(defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
+
+;; [68] EntityRef ::= '&' Name ';'
+(defconst xml-entity-ref (concat "&" xml-name-re ";"))
+
+(defconst xml-entity-or-char-ref-re (concat "&\\(?:#\\(x\\)?\\([0-9a-fA-F]+\\)\\|\\("
+ xml-name-re "\\)\\);"))
+
+;; [69] PEReference ::= '%' Name ';'
+(defconst xml-pe-reference-re (concat "%\\(" xml-name-re "\\);"))
+
+;; [67] Reference ::= EntityRef | CharRef
+(defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
+
+;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
+;; | "'" ([^<&'] | Reference)* "'"
+(defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|"
+ xml-reference-re "\\)*\"\\|"
+ "'\\(?:[^&']\\|" xml-reference-re
+ "\\)*'\\)"))
+
+;; [56] TokenizedType ::= 'ID'
+;; [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default]
+;; | 'IDREF' [VC: IDREF]
+;; | 'IDREFS' [VC: IDREF]
+;; | 'ENTITY' [VC: Entity Name]
+;; | 'ENTITIES' [VC: Entity Name]
+;; | 'NMTOKEN' [VC: Name Token]
+;; | 'NMTOKENS' [VC: Name Token]
+(defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|"
+ "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)"))
+
+;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
+(defconst xml-notation-type-re
+ (concat "\\(?:NOTATION\\s-+(\\s-*" xml-name-re
+ "\\(?:\\s-*|\\s-*" xml-name-re "\\)*\\s-*)\\)"))
+
+;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
+;; [VC: Enumeration] [VC: No Duplicate Tokens]
+(defconst xml-enumeration-re (concat "\\(?:(\\s-*" xml-nmtoken-re
+ "\\(?:\\s-*|\\s-*" xml-nmtoken-re
+ "\\)*\\s-+)\\)"))
+
+;; [57] EnumeratedType ::= NotationType | Enumeration
+(defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re
+ "\\|" xml-enumeration-re "\\)"))
+
+;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
+;; [55] StringType ::= 'CDATA'
+(defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re
+ "\\|" xml-notation-type-re
+ "\\|" xml-enumerated-type-re "\\)"))
+
+;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
+(defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|"
+ "\\(?:#FIXED\\s-+\\)*"
+ xml-att-value-re "\\)"))
+
+;; [53] AttDef ::= S Name S AttType S DefaultDecl
+(defconst xml-att-def-re (concat "\\(?:\\s-*" xml-name-re
+ "\\s-*" xml-att-type-re
+ "\\s-*" xml-default-decl-re "\\)"))
+
+;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
+;; | "'" ([^%&'] | PEReference | Reference)* "'"
+(defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|"
+ xml-pe-reference-re
+ "\\|" xml-reference-re
+ "\\)*\"\\|'\\(?:[^%&']\\|"
+ xml-pe-reference-re "\\|"
+ xml-reference-re "\\)*'\\)"))
+) ; End of `eval-when-compile'
+
;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
;; | 'PUBLIC' S PubidLiteral S SystemLiteral
;; Note that this is setup so that we can do whitespace-skipping with
;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
-;; compared with `re-search-forward', but that has been fixed. Also
-;; note that the standard syntax table contains other characters with
-;; whitespace syntax, like NBSP, but they are invalid in contexts in
-;; which we might skip whitespace -- specifically, they're not
-;; NameChars [XML 4].
+;; compared with `re-search-forward', but that has been fixed.
(defvar xml-syntax-table
- (let ((table (make-syntax-table)))
- ;; Get space syntax correct per XML [3].
- (dotimes (c 31)
- (modify-syntax-entry c "." table)) ; all are space in standard table
- (dolist (c '(?\t ?\n ?\r)) ; these should be space
+ ;; By default, characters have symbol syntax.
+ (let ((table (make-char-table 'syntax-table '(3))))
+ ;; The XML space chars [3], and nothing else, have space syntax.
+ (dolist (c '(?\s ?\t ?\r ?\n))
(modify-syntax-entry c " " table))
- ;; For skipping attributes.
- (modify-syntax-entry ?\" "\"" table)
- (modify-syntax-entry ?' "\"" table)
- ;; Non-alnum name chars should be symbol constituents (`-' and `_'
- ;; are OK by default).
- (modify-syntax-entry ?. "_" table)
- (modify-syntax-entry ?: "_" table)
- ;; XML [89]
- (unless (featurep 'xemacs)
- (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
- #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
- #x30FD #x30FE))
- (modify-syntax-entry (decode-char 'ucs c) "w" table)))
- ;; Fixme: rest of [4]
+ ;; The characters in NameStartChar [4], aside from ':' and '_',
+ ;; have word syntax. This is used by `xml-name-start-char-re'.
+ (modify-syntax-entry '(?A . ?Z) "w" table)
+ (modify-syntax-entry '(?a . ?z) "w" table)
+ (modify-syntax-entry '(#xC0 . #xD6) "w" table)
+ (modify-syntax-entry '(#xD8 . #XF6) "w" table)
+ (modify-syntax-entry '(#xF8 . #X2FF) "w" table)
+ (modify-syntax-entry '(#x370 . #X37D) "w" table)
+ (modify-syntax-entry '(#x37F . #x1FFF) "w" table)
+ (modify-syntax-entry '(#x200C . #x200D) "w" table)
+ (modify-syntax-entry '(#x2070 . #x218F) "w" table)
+ (modify-syntax-entry '(#x2C00 . #x2FEF) "w" table)
+ (modify-syntax-entry '(#x3001 . #xD7FF) "w" table)
+ (modify-syntax-entry '(#xF900 . #xFDCF) "w" table)
+ (modify-syntax-entry '(#xFDF0 . #xFFFD) "w" table)
+ (modify-syntax-entry '(#x10000 . #xEFFFF) "w" table)
table)
- "Syntax table used by `xml-parse-region'.")
+ "Syntax table used by the XML parser.
+In this syntax table, the XML space characters [ \\t\\r\\n], and
+only those characters, have whitespace syntax.")
-;; XML [5]
-;; Note that [:alpha:] matches all multibyte chars with word syntax.
-(eval-and-compile
- (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
+;;; Entry points:
-;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
-;; document ::= prolog element Misc*
-;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+;;;###autoload
+(defun xml-parse-file (file &optional parse-dtd parse-ns)
+ "Parse the well-formed XML file FILE.
+Return the top node with all its children.
+If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
+
+If PARSE-NS is non-nil, then QNAMES are expanded. By default,
+the variable `xml-default-ns' is the mapping from namespaces to
+URIs, and expanded names will be returned as a cons
+
+ (\"namespace:\" . \"foo\").
+
+If PARSE-NS is an alist, it will be used as the mapping from
+namespace to URIs instead.
+
+If it is the symbol 'symbol-qnames, expanded names will be
+returned as a plain symbol 'namespace:foo instead of a cons.
+
+Both features can be combined by providing a cons cell
+
+ (symbol-qnames . ALIST)."
+ (with-temp-buffer
+ (insert-file-contents file)
+ (xml--parse-buffer parse-dtd parse-ns)))
;;;###autoload
-(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
+(defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns)
"Parse the region from BEG to END in BUFFER.
+Return the XML parse tree, or raise an error if the region does
+not contain well-formed XML.
+
+If BEG is nil, it defaults to `point-min'.
+If END is nil, it defaults to `point-max'.
If BUFFER is nil, it defaults to the current buffer.
-Returns the XML list for the region, or raises an error if the region
-is not well-formed XML.
-If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
-and returned as the first element of the list.
-If PARSE-NS is non-nil, then QNAMES are expanded."
+If PARSE-DTD is non-nil, parse the DTD and return it as the first
+element of the list.
+If PARSE-NS is non-nil, then QNAMES are expanded. By default,
+the variable `xml-default-ns' is the mapping from namespaces to
+URIs, and expanded names will be returned as a cons
+
+ (\"namespace:\" . \"foo\").
+
+If PARSE-NS is an alist, it will be used as the mapping from
+namespace to URIs instead.
+
+If it is the symbol 'symbol-qnames, expanded names will be
+returned as a plain symbol 'namespace:foo instead of a cons.
+
+Both features can be combined by providing a cons cell
+
+ (symbol-qnames . ALIST)."
;; Use fixed syntax table to ensure regexp char classes and syntax
;; specs DTRT.
(unless buffer
(setq buffer (current-buffer)))
(with-temp-buffer
- (insert-buffer-substring buffer beg end)
+ (insert-buffer-substring-no-properties buffer beg end)
(xml--parse-buffer parse-dtd parse-ns)))
+;; XML [5]
+
+;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
+;; document ::= prolog element Misc*
+;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+
(defun xml--parse-buffer (parse-dtd parse-ns)
- (with-syntax-table (standard-syntax-table)
+ (with-syntax-table xml-syntax-table
(let ((case-fold-search nil) ; XML is case-sensitive.
;; Prevent entity definitions from changing the defaults
(xml-entity-alist xml-entity-alist)
(if (search-forward "<" nil t)
(progn
(forward-char -1)
- (setq result (xml-parse-tag parse-dtd parse-ns))
+ (setq result (xml-parse-tag-1 parse-dtd parse-ns))
(cond
((null result)
;; Not looking at an xml start tag.
During namespace-aware parsing, any name without a namespace is
put into the namespace identified by DEFAULT. nil is used to
-specify that the name shouldn't be given a namespace."
+specify that the name shouldn't be given a namespace.
+Expanded names will by default be returned as a cons. If you
+would like to get plain symbols instead, provide a cons cell
+
+ (symbol-qnames . ALIST)
+
+in the XML-NS argument."
(if (consp xml-ns)
- (let* ((nsp (string-match ":" name))
+ (let* ((symbol-qnames (eq (car-safe xml-ns) 'symbol-qnames))
+ (nsp (string-match ":" name))
(lname (if nsp (substring name (match-end 0)) name))
(prefix (if nsp (substring name 0 (match-beginning 0)) default))
(special (and (string-equal lname "xmlns") (not prefix)))
;; Setting default to nil will insure that there is not
;; matching cons in xml-ns. In which case we
(ns (or (cdr (assoc (if special "xmlns" prefix)
- xml-ns))
+ (if symbol-qnames (cdr xml-ns) xml-ns)))
"")))
- (cons ns (if special "" lname)))
+ (if (and symbol-qnames
+ (not (string= prefix "xmlns")))
+ (intern (concat ns lname))
+ (cons ns (if special "" lname))))
(intern name)))
-(defun xml-parse-fragment (&optional parse-dtd parse-ns)
- "Parse xml-like fragments."
- (let ((xml-sub-parser t)
- ;; Prevent entity definitions from changing the defaults
- (xml-entity-alist xml-entity-alist)
- (xml-parameter-entity-alist xml-parameter-entity-alist)
- children)
- (while (not (eobp))
- (let ((bit (xml-parse-tag
- parse-dtd parse-ns)))
- (if children
- (setq children (append (list bit) children))
- (if (stringp bit)
- (setq children (list bit))
- (setq children bit)))))
- (reverse children)))
-
(defun xml-parse-tag (&optional parse-dtd parse-ns)
"Parse the tag at point.
If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
returned as the first element in the list.
-If PARSE-NS is non-nil, then QNAMES are expanded.
-Returns one of:
+If PARSE-NS is non-nil, expand QNAMES; for further details, see
+`xml-parse-region'.
+
+Return one of:
- a list : the matching node
- nil : the point is not looking at a tag.
- a pair : the first element is the DTD, the second is the node."
- (let ((xml-validating-parser (or parse-dtd xml-validating-parser))
- (xml-ns (if (consp parse-ns)
- parse-ns
- (if parse-ns
- (list
- ;; Default for empty prefix is no namespace
- (cons "" "")
- ;; "xml" namespace
- (cons "xml" "http://www.w3.org/XML/1998/namespace")
- ;; We need to seed the xmlns namespace
- (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
+ (let* ((case-fold-search nil)
+ ;; Prevent entity definitions from changing the defaults
+ (xml-entity-alist xml-entity-alist)
+ (xml-parameter-entity-alist xml-parameter-entity-alist)
+ (buf (current-buffer))
+ (pos (point)))
+ (with-temp-buffer
+ (with-syntax-table xml-syntax-table
+ (insert-buffer-substring-no-properties buf pos)
+ (goto-char (point-min))
+ (xml-parse-tag-1 parse-dtd parse-ns)))))
+
+(defun xml-parse-tag-1 (&optional parse-dtd parse-ns)
+ "Like `xml-parse-tag', but possibly modify the buffer while working."
+ (let* ((xml-validating-parser (or parse-dtd xml-validating-parser))
+ (xml-ns
+ (cond ((eq parse-ns 'symbol-qnames)
+ (cons 'symbol-qnames xml-default-ns))
+ ((or (consp (car-safe parse-ns))
+ (and (eq (car-safe parse-ns) 'symbol-qnames)
+ (listp (cdr parse-ns))))
+ parse-ns)
+ (parse-ns
+ xml-default-ns))))
(cond
- ;; Processing instructions (like the <?xml version="1.0"?> tag at the
- ;; beginning of a document).
+ ;; Processing instructions, like <?xml version="1.0"?>.
((looking-at "<\\?")
(search-forward "?>")
(skip-syntax-forward " ")
- (xml-parse-tag parse-dtd xml-ns))
- ;; Character data (CDATA) sections, in which no tag should be interpreted
+ (xml-parse-tag-1 parse-dtd xml-ns))
+ ;; Character data (CDATA) sections, in which no tag should be interpreted
((looking-at "<!\\[CDATA\\[")
(let ((pos (match-end 0)))
(unless (search-forward "]]>" nil t)
(concat
(buffer-substring-no-properties pos (match-beginning 0))
(xml-parse-string))))
- ;; DTD for the document
+ ;; DTD for the document
((looking-at "<!DOCTYPE[ \t\n\r]")
(let ((dtd (xml-parse-dtd parse-ns)))
(skip-syntax-forward " ")
(if xml-validating-parser
- (cons dtd (xml-parse-tag nil xml-ns))
- (xml-parse-tag nil xml-ns))))
- ;; skip comments
+ (cons dtd (xml-parse-tag-1 nil xml-ns))
+ (xml-parse-tag-1 nil xml-ns))))
+ ;; skip comments
((looking-at "<!--")
(search-forward "-->")
+ ;; FIXME: This loses the skipped-over spaces.
(skip-syntax-forward " ")
(unless (eobp)
(let ((xml-sub-parser t))
- (xml-parse-tag parse-dtd xml-ns))))
- ;; end tag
+ (xml-parse-tag-1 parse-dtd xml-ns))))
+ ;; end tag
((looking-at "</")
'())
- ;; opening tag
- ((looking-at "<\\([^/>[:space:]]+\\)")
+ ;; opening tag
+ ((looking-at (eval-when-compile (concat "<\\(" xml-name-re "\\)")))
(goto-char (match-end 1))
-
;; Parse this node
(let* ((node-name (match-string-no-properties 1))
;; Parse the attribute list.
(attrs (xml-parse-attlist xml-ns))
children)
-
;; add the xmlns:* attrs to our cache
(when (consp xml-ns)
(dolist (attr attrs)
(equal "http://www.w3.org/2000/xmlns/"
(caar attr)))
(push (cons (cdar attr) (cdr attr))
- xml-ns))))
-
+ (if (symbolp (car xml-ns))
+ (cdr xml-ns)
+ xml-ns)))))
(setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
+ (cond
+ ;; is this an empty element ?
+ ((looking-at "/>")
+ (forward-char 2)
+ (nreverse children))
+ ;; is this a valid start tag ?
+ ((eq (char-after) ?>)
+ (forward-char 1)
+ ;; Now check that we have the right end-tag.
+ (let ((end (concat "</" node-name "\\s-*>")))
+ (while (not (looking-at end))
+ (cond
+ ((eobp)
+ (error "XML: (Not Well-Formed) End of document while reading element `%s'"
+ node-name))
+ ((looking-at "</")
+ (forward-char 2)
+ (error "XML: (Not Well-Formed) Invalid end tag `%s' (expecting `%s')"
+ (let ((pos (point)))
+ (buffer-substring pos (if (re-search-forward "\\s-*>" nil t)
+ (match-beginning 0)
+ (point-max))))
+ node-name))
+ ;; Read a sub-element and push it onto CHILDREN.
+ ((= (char-after) ?<)
+ (let ((tag (xml-parse-tag-1 nil xml-ns)))
+ (when tag
+ (push tag children))))
+ ;; Read some character data.
+ (t
+ (let ((expansion (xml-parse-string)))
+ (push (if (stringp (car children))
+ ;; If two strings were separated by a
+ ;; comment, concat them.
+ (concat (pop children) expansion)
+ expansion)
+ children)))))
+ ;; Move point past the end-tag.
+ (goto-char (match-end 0))
+ (nreverse children)))
+ ;; Otherwise this was an invalid start tag (expected ">" not found.)
+ (t
+ (error "XML: (Well-Formed) Couldn't parse tag: %s"
+ (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
- ;; is this an empty element ?
- (if (looking-at "/>")
- (progn
- (forward-char 2)
- (nreverse children))
-
- ;; is this a valid start tag ?
- (if (eq (char-after) ?>)
- (progn
- (forward-char 1)
- ;; Now check that we have the right end-tag. Note that this
- ;; one might contain spaces after the tag name
- (let ((end (concat "</" node-name "\\s-*>")))
- (while (not (looking-at end))
- (cond
- ((looking-at "</")
- (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
- node-name (point)))
- ((= (char-after) ?<)
- (let ((tag (xml-parse-tag nil xml-ns)))
- (when tag
- (push tag children))))
- (t
- (let ((expansion (xml-parse-string)))
- (setq children
- (if (stringp expansion)
- (if (stringp (car children))
- ;; The two strings were separated by a comment.
- (setq children (append (list (concat (car children) expansion))
- (cdr children)))
- (setq children (append (list expansion) children)))
- (setq children (append expansion children))))))))
-
- (goto-char (match-end 0))
- (nreverse children)))
- ;; This was an invalid start tag (Expected ">", but didn't see it.)
- (error "XML: (Well-Formed) Couldn't parse tag: %s"
- (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
- (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
- (unless xml-sub-parser ; Usually, we error out.
+ ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
+ (t
+ (unless xml-sub-parser ; Usually, we error out.
(error "XML: (Well-Formed) Invalid character"))
-
;; However, if we're parsing incrementally, then we need to deal
;; with stray CDATA.
(xml-parse-string)))))
(defun xml-parse-string ()
- "Parse the next whatever. Could be a string, or an element."
- (let* ((pos (point))
- (string (progn (skip-chars-forward "^<")
- (buffer-substring-no-properties pos (point)))))
- ;; Clean up the string. As per XML specifications, the XML
- ;; processor should always pass the whole string to the
- ;; application. But \r's should be replaced:
- ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
- (setq pos 0)
- (while (string-match "\r\n?" string pos)
- (setq string (replace-match "\n" t t string))
- (setq pos (1+ (match-beginning 0))))
-
- (xml-substitute-special string)))
+ "Parse character data at point, and return it as a string.
+Leave point at the start of the next thing to parse. This
+function can modify the buffer by expanding entity and character
+references."
+ (let ((start (point))
+ ;; Keep track of the size of the rest of the buffer:
+ (old-remaining-size (- (buffer-size) (point)))
+ ref val)
+ (while (and (not (eobp))
+ (not (looking-at "<")))
+ ;; Find the next < or & character.
+ (skip-chars-forward "^<&")
+ (when (eq (char-after) ?&)
+ ;; If we find an entity or character reference, expand it.
+ (unless (looking-at xml-entity-or-char-ref-re)
+ (error "XML: (Not Well-Formed) Invalid entity reference"))
+ ;; For a character reference, the next entity or character
+ ;; reference must be after the replacement. [4.6] "Numerical
+ ;; character references are expanded immediately when
+ ;; recognized and MUST be treated as character data."
+ (if (setq ref (match-string 2))
+ (progn ; Numeric char reference
+ (setq val (save-match-data
+ (decode-char 'ucs (string-to-number
+ ref (if (match-string 1) 16)))))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Invalid character reference `%s'"
+ (match-string 0)))
+ (replace-match (if val (string val) xml-undefined-entity) t t))
+ ;; For an entity reference, search again from the start of
+ ;; the replaced text, since the replacement can contain
+ ;; entity or character references, or markup.
+ (setq ref (match-string 3)
+ val (assoc ref xml-entity-alist))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Undefined entity `%s'" ref))
+ (replace-match (or (cdr val) xml-undefined-entity) t t)
+ (goto-char (match-beginning 0)))
+ ;; Check for XML bombs.
+ (and xml-entity-expansion-limit
+ (> (- (buffer-size) (point))
+ (+ old-remaining-size xml-entity-expansion-limit))
+ (error "XML: Entity reference expansion \
+surpassed `xml-entity-expansion-limit'"))))
+ ;; [2.11] Clean up line breaks.
+ (let ((end-marker (point-marker)))
+ (goto-char start)
+ (while (re-search-forward "\r\n?" end-marker t)
+ (replace-match "\n" t t))
+ (goto-char end-marker)
+ (buffer-substring start (point)))))
(defun xml-parse-attlist (&optional xml-ns)
"Return the attribute-list after point.
end-pos name)
(skip-syntax-forward " ")
(while (looking-at (eval-when-compile
- (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
+ (concat "\\(" xml-name-re "\\)\\s-*=\\s-*")))
(setq end-pos (match-end 0))
(setq name (xml-maybe-do-ns (match-string-no-properties 1) nil xml-ns))
(goto-char end-pos)
(replace-regexp-in-string "\\s-\\{2,\\}" " " string)
(let ((expansion (xml-substitute-special string)))
(unless (stringp expansion)
- ; We say this is the constraint. It is actually that neither
- ; external entities nor "<" can be in an attribute value.
+ ;; We say this is the constraint. It is actually that
+ ;; neither external entities nor "<" can be in an
+ ;; attribute value.
(error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
(push (cons name expansion) attlist)))
(skip-syntax-forward " "))
(nreverse attlist)))
-;;*******************************************************************
-;;**
-;;** The DTD (document type declaration)
-;;** The following functions know how to skip or parse the DTD of
-;;** a document
-;;**
-;;*******************************************************************
+;;; DTD (document type declaration)
-;; Fixme: This fails at least if the DTD contains conditional sections.
+;; The following functions know how to skip or parse the DTD of a
+;; document. FIXME: it fails at least if the DTD contains conditional
+;; sections.
(defun xml-skip-dtd ()
"Skip the DTD at point.
(error "XML: (Validity) Invalid DTD (expecting name of the document)"))
;; Get the name of the document
- (looking-at xml-name-regexp)
+ (looking-at xml-name-re)
(let ((dtd (list (match-string-no-properties 0) 'dtd))
(xml-parameter-entity-alist xml-parameter-entity-alist)
- (parameter-entity-re (eval-when-compile
- (concat "%\\(" xml-name-re "\\);")))
next-parameter-entity)
(goto-char (match-end 0))
(skip-syntax-forward " ")
;; and try again.
(setq next-parameter-entity
(save-excursion
- (if (re-search-forward parameter-entity-re nil t)
+ (if (re-search-forward xml-pe-reference-re nil t)
(match-beginning 0))))
;; Parse the rest of the DTD
(while (not (looking-at "\\s-*\\]"))
(skip-syntax-forward " ")
(cond
+ ((eobp)
+ (error "XML: (Well-Formed) End of document while reading DTD"))
;; Element declaration [45]:
((and (looking-at (eval-when-compile
(concat "<!ELEMENT\\s-+\\(" xml-name-re
(> (point) next-parameter-entity)
(setq next-parameter-entity
(save-excursion
- (if (re-search-forward parameter-entity-re nil t)
+ (if (re-search-forward xml-pe-reference-re nil t)
(match-beginning 0))))))
;; Internal entity declarations:
(next-parameter-entity
(save-excursion
(goto-char next-parameter-entity)
- (unless (looking-at parameter-entity-re)
+ (unless (looking-at xml-pe-reference-re)
(error "XML: Internal error"))
(let* ((entity (match-string 1))
(beg (point-marker))
(goto-char next-parameter-entity))
(goto-char (match-end 0))))
(setq next-parameter-entity
- (if (re-search-forward parameter-entity-re nil t)
+ (if (re-search-forward xml-pe-reference-re nil t)
(match-beginning 0)))))
- ;; Anything else:
+ ;; Anything else is garbage (ignored if not validating).
(xml-validating-parser
- (error "XML: (Validity) Invalid DTD item"))))
+ (error "XML: (Validity) Invalid DTD item"))
+ (t
+ (skip-chars-forward "^]"))))
(if (looking-at "\\s-*]>")
(goto-char (match-end 0))))
;; Parameter entity reference
((setq ref (match-string 3 string))
(setq val (assoc ref xml-parameter-entity-alist))
- (if val
- (push (cdr val) children)
- (push (concat "%" ref ";") children))))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Undefined parameter entity `%s'" ref))
+ (push (or (cdr val) xml-undefined-entity) children)))
(setq string remainder)))
(mapconcat 'identity (nreverse (cons string children)) "")))
(t
elem))))
-;;*******************************************************************
-;;**
-;;** Substituting special XML sequences
-;;**
-;;*******************************************************************
+;;; Substituting special XML sequences
(defun xml-substitute-special (string)
- "Return STRING, after substituting entity references."
- ;; This originally made repeated passes through the string from the
- ;; beginning, which isn't correct, since then either "&amp;" or
- ;; "&amp;" won't DTRT.
-
- (let ((point 0)
- children end-point)
- (while (string-match "&\\([^;]*\\);" string point)
- (setq end-point (match-end 0))
- (let* ((this-part (match-string-no-properties 1 string))
- (prev-part (substring string point (match-beginning 0)))
- (entity (assoc this-part xml-entity-alist))
- (expansion
- (cond ((string-match "#\\([0-9]+\\)" this-part)
- (let ((c (decode-char
- 'ucs
- (string-to-number (match-string-no-properties 1 this-part)))))
- (if c (string c))))
- ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
- (let ((c (decode-char
- 'ucs
- (string-to-number (match-string-no-properties 1 this-part) 16))))
- (if c (string c))))
- (entity
- (cdr entity))
- ((eq (length this-part) 0)
- (error "XML: (Not Well-Formed) No entity given"))
- (t
- (if xml-validating-parser
- (error "XML: (Validity) Undefined entity `%s'"
- this-part)
- xml-undefined-entity)))))
-
- (cond ((null children)
- ;; FIXME: If we have an entity that expands into XML, this won't work.
- (setq children
- (concat prev-part expansion)))
- ((stringp children)
- (if (stringp expansion)
- (setq children (concat children prev-part expansion))
- (setq children (list expansion (concat prev-part children)))))
- ((and (stringp expansion)
- (stringp (car children)))
- (setcar children (concat prev-part expansion (car children))))
- ((stringp expansion)
- (setq children (append (concat prev-part expansion)
- children)))
- ((stringp (car children))
- (setcar children (concat (car children) prev-part))
- (setq children (append expansion children)))
- (t
- (setq children (list expansion
- prev-part
- children))))
- (setq point end-point)))
- (cond ((stringp children)
- (concat children (substring string point)))
- ((stringp (car (last children)))
- (concat (car (last children)) (substring string point)))
- ((null children)
- string)
- (t
- (concat (mapconcat 'identity
- (nreverse children)
- "")
- (substring string point))))))
+ "Return STRING, after substituting entity and character references.
+STRING is assumed to occur in an XML attribute value."
+ (let ((strlen (length string))
+ children)
+ (while (string-match xml-entity-or-char-ref-re string)
+ (push (substring string 0 (match-beginning 0)) children)
+ (let* ((remainder (substring string (match-end 0)))
+ (is-hex (match-string 1 string)) ; Is it a hex numeric reference?
+ (ref (match-string 2 string))) ; Numeric part of reference
+ (if ref
+ ;; [4.6] Character references are included as
+ ;; character data.
+ (let ((val (decode-char 'ucs (string-to-number ref (if is-hex 16)))))
+ (push (cond (val (string val))
+ (xml-validating-parser
+ (error "XML: (Validity) Undefined character `x%s'" ref))
+ (t xml-undefined-entity))
+ children)
+ (setq string remainder
+ strlen (length string)))
+ ;; [4.4.5] Entity references are "included in literal".
+ ;; Note that we don't need do anything special to treat
+ ;; quotes as normal data characters.
+ (setq ref (match-string 3 string)) ; entity name
+ (let ((val (or (cdr (assoc ref xml-entity-alist))
+ (if xml-validating-parser
+ (error "XML: (Validity) Undefined entity `%s'" ref)
+ xml-undefined-entity))))
+ (setq string (concat val remainder)))
+ (and xml-entity-expansion-limit
+ (> (length string) (+ strlen xml-entity-expansion-limit))
+ (error "XML: Passed `xml-entity-expansion-limit' while expanding `&%s;'"
+ ref)))))
+ (mapconcat 'identity (nreverse (cons string children)) "")))
(defun xml-substitute-numeric-entities (string)
"Substitute SGML numeric entities by their respective utf characters.
string)
nil))
-;;*******************************************************************
-;;**
-;;** Printing a tree.
-;;** This function is intended mainly for debugging purposes.
-;;**
-;;*******************************************************************
+;;; Printing a parse tree (mainly for debugging).
(defun xml-debug-print (xml &optional indent-string)
"Outputs the XML in the current buffer.
(defalias 'xml-print 'xml-debug-print)
(defun xml-escape-string (string)
- "Return STRING with entity substitutions made from `xml-entity-alist'."
- (mapconcat (lambda (byte)
- (let ((char (char-to-string byte)))
- (if (rassoc char xml-entity-alist)
- (concat "&" (car (rassoc char xml-entity-alist)) ";")
- char)))
- string ""))
+ "Convert STRING into a string containing valid XML character data.
+Replace occurrences of &<>'\" in STRING with their default XML
+entity references (e.g. replace each & with &).
+
+XML character data must not contain & or < characters, nor the >
+character under some circumstances. The XML spec does not impose
+restriction on \" or ', but we just substitute for these too
+\(as is permitted by the spec)."
+ (with-temp-buffer
+ (insert string)
+ (dolist (substitution '(("&" . "&")
+ ("<" . "<")
+ (">" . ">")
+ ("'" . "'")
+ ("\"" . """)))
+ (goto-char (point-min))
+ (while (search-forward (car substitution) nil t)
+ (replace-match (cdr substitution) t t nil)))
+ (buffer-string)))
(defun xml-debug-print-internal (xml indent-string)
"Outputs the XML tree in the current buffer.