;;; xml.el --- XML parser
-;; Copyright (C) 2000, 01, 03, 2004 Free Software Foundation, Inc.
+;; Copyright (C) 2000-2016 Free Software Foundation, Inc.
;; Author: Emmanuel Briot <briot@gnat.com>
;; Maintainer: Mark A. Hershberger <mah@everybody.org>
;; This file is part of GNU Emacs.
-;; GNU Emacs is free software; you can redistribute it and/or modify
+;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
-;; the Free Software Foundation; either version 2, or (at your option)
-;; any later version.
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
-;; along with GNU Emacs; see the file COPYING. If not, write to the
-;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-;; Boston, MA 02111-1307, USA.
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;;; Code:
-;; Note that {buffer-substring,match-string}-no-properties were
-;; formerly used in several places, but that removes composition info.
+;; Note that buffer-substring and match-string were formerly used in
+;; several places, because the -no-properties variants remove
+;; composition info. However, after some discussion on emacs-devel,
+;; the consensus was that the speed of the -no-properties variants was
+;; a worthwhile tradeoff especially since we're usually parsing files
+;; instead of hand-crafted XML.
-;;*******************************************************************
-;;**
-;;** Macros to parse the list
-;;**
-;;*******************************************************************
+;;; Macros to parse the list
+
+(defconst xml-undefined-entity "?"
+ "What to substitute for undefined entities")
+
+(defconst xml-default-ns '(("" . "")
+ ("xml" . "http://www.w3.org/XML/1998/namespace")
+ ("xmlns" . "http://www.w3.org/2000/xmlns/"))
+ "Alist mapping default XML namespaces to their URIs.")
+
+(defvar xml-entity-alist
+ '(("lt" . "<")
+ ("gt" . ">")
+ ("apos" . "'")
+ ("quot" . "\"")
+ ("amp" . "&"))
+ "Alist mapping XML entities to their replacement text.")
+
+(defvar xml-entity-expansion-limit 20000
+ "The maximum size of entity reference expansions.
+If the size of the buffer increases by this many characters while
+expanding entity references in a segment of character data, the
+XML parser signals an error. Setting this to nil removes the
+limit (making the parser vulnerable to XML bombs).")
+
+(defvar xml-parameter-entity-alist nil
+ "Alist of defined XML parametric entities.")
+
+(defvar xml-sub-parser nil
+ "Non-nil when the XML parser is parsing an XML fragment.")
+
+(defvar xml-validating-parser nil
+ "Set to non-nil to get validity checking.")
(defsubst xml-node-name (node)
"Return the tag associated with NODE.
would be represented by
- '(\"\" . \"foo\")."
+ (\"\" . \"foo\").
+
+If you'd just like a plain symbol instead, use `symbol-qnames' in
+the PARSE-NS argument."
(car node))
See also `xml-get-attribute-or-nil'."
(or (xml-get-attribute-or-nil node attribute) ""))
-;;*******************************************************************
-;;**
-;;** Creating the list
-;;**
-;;*******************************************************************
+;;; Regular expressions for XML components
-;;;###autoload
-(defun xml-parse-file (file &optional parse-dtd parse-ns)
- "Parse the well-formed XML file FILE.
-If FILE is already visited, use its buffer and don't kill it.
-Returns the top node with all its children.
-If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
-If PARSE-NS is non-nil, then QNAMES are expanded."
- (let ((keep))
- (if (get-file-buffer file)
- (progn
- (set-buffer (get-file-buffer file))
- (setq keep (point)))
- (let (auto-mode-alist) ; no need for xml-mode
- (find-file file)))
-
- (let ((xml (xml-parse-region (point-min)
- (point-max)
- (current-buffer)
- parse-dtd parse-ns)))
- (if keep
- (goto-char keep)
- (kill-buffer (current-buffer)))
- xml)))
+;; The following regexps are used as subexpressions in regexps that
+;; are `eval-when-compile'd for efficiency, so they must be defined at
+;; compile time.
+(eval-and-compile
+
+;; [4] NameStartChar
+;; See the definition of word syntax in `xml-syntax-table'.
+(defconst xml-name-start-char-re (concat "[[:word:]:_]"))
+
+;; [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
+;; | [#x0300-#x036F] | [#x203F-#x2040]
+(defconst xml-name-char-re (concat "[-0-9.[:word:]:_·̀-ͯ‿-⁀]"))
+
+;; [5] Name ::= NameStartChar (NameChar)*
+(defconst xml-name-re (concat xml-name-start-char-re xml-name-char-re "*"))
+
+;; [6] Names ::= Name (#x20 Name)*
+(defconst xml-names-re (concat xml-name-re "\\(?: " xml-name-re "\\)*"))
+
+;; [7] Nmtoken ::= (NameChar)+
+(defconst xml-nmtoken-re (concat xml-name-char-re "+"))
+
+;; [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+(defconst xml-nmtokens-re (concat xml-nmtoken-re "\\(?: " xml-name-re "\\)*"))
+
+;; [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
+(defconst xml-char-ref-re "\\(?:&#[0-9]+;\\|&#x[0-9a-fA-F]+;\\)")
+
+;; [68] EntityRef ::= '&' Name ';'
+(defconst xml-entity-ref (concat "&" xml-name-re ";"))
+
+(defconst xml-entity-or-char-ref-re (concat "&\\(?:#\\(x\\)?\\([0-9a-fA-F]+\\)\\|\\("
+ xml-name-re "\\)\\);"))
+
+;; [69] PEReference ::= '%' Name ';'
+(defconst xml-pe-reference-re (concat "%\\(" xml-name-re "\\);"))
+
+;; [67] Reference ::= EntityRef | CharRef
+(defconst xml-reference-re (concat "\\(?:" xml-entity-ref "\\|" xml-char-ref-re "\\)"))
+
+;; [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
+;; | "'" ([^<&'] | Reference)* "'"
+(defconst xml-att-value-re (concat "\\(?:\"\\(?:[^&\"]\\|"
+ xml-reference-re "\\)*\"\\|"
+ "'\\(?:[^&']\\|" xml-reference-re
+ "\\)*'\\)"))
+
+;; [56] TokenizedType ::= 'ID'
+;; [VC: ID] [VC: One ID / Element Type] [VC: ID Attribute Default]
+;; | 'IDREF' [VC: IDREF]
+;; | 'IDREFS' [VC: IDREF]
+;; | 'ENTITY' [VC: Entity Name]
+;; | 'ENTITIES' [VC: Entity Name]
+;; | 'NMTOKEN' [VC: Name Token]
+;; | 'NMTOKENS' [VC: Name Token]
+(defconst xml-tokenized-type-re (concat "\\(?:ID\\|IDREF\\|IDREFS\\|ENTITY\\|"
+ "ENTITIES\\|NMTOKEN\\|NMTOKENS\\)"))
+
+;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
+(defconst xml-notation-type-re
+ (concat "\\(?:NOTATION\\s-+(\\s-*" xml-name-re
+ "\\(?:\\s-*|\\s-*" xml-name-re "\\)*\\s-*)\\)"))
+
+;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
+;; [VC: Enumeration] [VC: No Duplicate Tokens]
+(defconst xml-enumeration-re (concat "\\(?:(\\s-*" xml-nmtoken-re
+ "\\(?:\\s-*|\\s-*" xml-nmtoken-re
+ "\\)*\\s-+)\\)"))
+
+;; [57] EnumeratedType ::= NotationType | Enumeration
+(defconst xml-enumerated-type-re (concat "\\(?:" xml-notation-type-re
+ "\\|" xml-enumeration-re "\\)"))
+
+;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
+;; [55] StringType ::= 'CDATA'
+(defconst xml-att-type-re (concat "\\(?:CDATA\\|" xml-tokenized-type-re
+ "\\|" xml-notation-type-re
+ "\\|" xml-enumerated-type-re "\\)"))
+
+;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
+(defconst xml-default-decl-re (concat "\\(?:#REQUIRED\\|#IMPLIED\\|"
+ "\\(?:#FIXED\\s-+\\)*"
+ xml-att-value-re "\\)"))
+
+;; [53] AttDef ::= S Name S AttType S DefaultDecl
+(defconst xml-att-def-re (concat "\\(?:\\s-*" xml-name-re
+ "\\s-*" xml-att-type-re
+ "\\s-*" xml-default-decl-re "\\)"))
+
+;; [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
+;; | "'" ([^%&'] | PEReference | Reference)* "'"
+(defconst xml-entity-value-re (concat "\\(?:\"\\(?:[^%&\"]\\|"
+ xml-pe-reference-re
+ "\\|" xml-reference-re
+ "\\)*\"\\|'\\(?:[^%&']\\|"
+ xml-pe-reference-re "\\|"
+ xml-reference-re "\\)*'\\)"))
+) ; End of `eval-when-compile'
+
+
+;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
+;; | 'PUBLIC' S PubidLiteral S SystemLiteral
+;; [76] NDataDecl ::= S 'NDATA' S
+;; [73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
+;; [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
+;; [74] PEDef ::= EntityValue | ExternalID
+;; [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
+;; [70] EntityDecl ::= GEDecl | PEDecl
;; Note that this is setup so that we can do whitespace-skipping with
;; `(skip-syntax-forward " ")', inter alia. Previously this was slow
-;; compared with `re-search-forward', but that has been fixed. Also
-;; note that the standard syntax table contains other characters with
-;; whitespace syntax, like NBSP, but they are invalid in contexts in
-;; which we might skip whitespace -- specifically, they're not
-;; NameChars [XML 4].
+;; compared with `re-search-forward', but that has been fixed.
(defvar xml-syntax-table
- (let ((table (make-syntax-table)))
- ;; Get space syntax correct per XML [3].
- (dotimes (c 31)
- (modify-syntax-entry c "." table)) ; all are space in standard table
- (dolist (c '(?\t ?\n ?\r)) ; these should be space
+ ;; By default, characters have symbol syntax.
+ (let ((table (make-char-table 'syntax-table '(3))))
+ ;; The XML space chars [3], and nothing else, have space syntax.
+ (dolist (c '(?\s ?\t ?\r ?\n))
(modify-syntax-entry c " " table))
- ;; For skipping attributes.
- (modify-syntax-entry ?\" "\"" table)
- (modify-syntax-entry ?' "\"" table)
- ;; Non-alnum name chars should be symbol constituents (`-' and `_'
- ;; are OK by default).
- (modify-syntax-entry ?. "_" table)
- (modify-syntax-entry ?: "_" table)
- ;; XML [89]
- (dolist (c '(#x00B7 #x02D0 #x02D1 #x0387 #x0640 #x0E46 #x0EC6 #x3005
- #x3031 #x3032 #x3033 #x3034 #x3035 #x309D #x309E #x30FC
- #x30FD #x30FE))
- (modify-syntax-entry (decode-char 'ucs c) "w" table))
- ;; Fixme: rest of [4]
+ ;; The characters in NameStartChar [4], aside from ':' and '_',
+ ;; have word syntax. This is used by `xml-name-start-char-re'.
+ (modify-syntax-entry '(?A . ?Z) "w" table)
+ (modify-syntax-entry '(?a . ?z) "w" table)
+ (modify-syntax-entry '(#xC0 . #xD6) "w" table)
+ (modify-syntax-entry '(#xD8 . #XF6) "w" table)
+ (modify-syntax-entry '(#xF8 . #X2FF) "w" table)
+ (modify-syntax-entry '(#x370 . #X37D) "w" table)
+ (modify-syntax-entry '(#x37F . #x1FFF) "w" table)
+ (modify-syntax-entry '(#x200C . #x200D) "w" table)
+ (modify-syntax-entry '(#x2070 . #x218F) "w" table)
+ (modify-syntax-entry '(#x2C00 . #x2FEF) "w" table)
+ (modify-syntax-entry '(#x3001 . #xD7FF) "w" table)
+ (modify-syntax-entry '(#xF900 . #xFDCF) "w" table)
+ (modify-syntax-entry '(#xFDF0 . #xFFFD) "w" table)
+ (modify-syntax-entry '(#x10000 . #xEFFFF) "w" table)
table)
- "Syntax table used by `xml-parse-region'.")
+ "Syntax table used by the XML parser.
+In this syntax table, the XML space characters [ \\t\\r\\n], and
+only those characters, have whitespace syntax.")
-;; XML [5]
-;; Note that [:alpha:] matches all multibyte chars with word syntax.
-(eval-and-compile
- (defconst xml-name-regexp "[[:alpha:]_:][[:alnum:]._:-]*"))
+;;; Entry points:
-;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
-;; document ::= prolog element Misc*
-;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+;;;###autoload
+(defun xml-parse-file (file &optional parse-dtd parse-ns)
+ "Parse the well-formed XML file FILE.
+Return the top node with all its children.
+If PARSE-DTD is non-nil, the DTD is parsed rather than skipped.
+
+If PARSE-NS is non-nil, then QNAMES are expanded. By default,
+the variable `xml-default-ns' is the mapping from namespaces to
+URIs, and expanded names will be returned as a cons
+
+ (\"namespace:\" . \"foo\").
+
+If PARSE-NS is an alist, it will be used as the mapping from
+namespace to URIs instead.
+
+If it is the symbol `symbol-qnames', expanded names will be
+returned as a plain symbol `namespace:foo' instead of a cons.
+
+Both features can be combined by providing a cons cell
+
+ (symbol-qnames . ALIST)."
+ (with-temp-buffer
+ (insert-file-contents file)
+ (xml--parse-buffer parse-dtd parse-ns)))
;;;###autoload
-(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
+(defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns)
"Parse the region from BEG to END in BUFFER.
+Return the XML parse tree, or raise an error if the region does
+not contain well-formed XML.
+
+If BEG is nil, it defaults to `point-min'.
+If END is nil, it defaults to `point-max'.
If BUFFER is nil, it defaults to the current buffer.
-Returns the XML list for the region, or raises an error if the region
-is not well-formed XML.
-If PARSE-DTD is non-nil, the DTD is parsed rather than skipped,
-and returned as the first element of the list.
-If PARSE-NS is non-nil, then QNAMES are expanded."
- (save-restriction
- (narrow-to-region beg end)
- ;; Use fixed syntax table to ensure regexp char classes and syntax
- ;; specs DTRT.
- (with-syntax-table (standard-syntax-table)
- (let ((case-fold-search nil) ; XML is case-sensitive.
- xml result dtd)
- (save-excursion
- (if buffer
- (set-buffer buffer))
- (goto-char (point-min))
- (while (not (eobp))
- (if (search-forward "<" nil t)
- (progn
- (forward-char -1)
- (setq result (xml-parse-tag parse-dtd parse-ns))
- (if (and xml result)
- ;; translation of rule [1] of XML specifications
- (error "XML files can have only one toplevel tag")
- (cond
- ((null result))
- ((and (listp (car result))
- parse-dtd)
- (setq dtd (car result))
- (if (cdr result) ; possible leading comment
- (add-to-list 'xml (cdr result))))
- (t
- (add-to-list 'xml result)))))
- (goto-char (point-max))))
- (if parse-dtd
- (cons dtd (nreverse xml))
- (nreverse xml)))))))
+If PARSE-DTD is non-nil, parse the DTD and return it as the first
+element of the list.
+If PARSE-NS is non-nil, then QNAMES are expanded. By default,
+the variable `xml-default-ns' is the mapping from namespaces to
+URIs, and expanded names will be returned as a cons
+
+ (\"namespace:\" . \"foo\").
+
+If PARSE-NS is an alist, it will be used as the mapping from
+namespace to URIs instead.
+
+If it is the symbol `symbol-qnames', expanded names will be
+returned as a plain symbol `namespace:foo' instead of a cons.
+
+Both features can be combined by providing a cons cell
+
+ (symbol-qnames . ALIST)."
+ ;; Use fixed syntax table to ensure regexp char classes and syntax
+ ;; specs DTRT.
+ (unless buffer
+ (setq buffer (current-buffer)))
+ (with-temp-buffer
+ (insert-buffer-substring-no-properties buffer beg end)
+ (xml--parse-buffer parse-dtd parse-ns)))
+
+;; XML [5]
+
+;; Fixme: This needs re-writing to deal with the XML grammar properly, i.e.
+;; document ::= prolog element Misc*
+;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+
+(defun xml--parse-buffer (parse-dtd parse-ns)
+ (with-syntax-table xml-syntax-table
+ (let ((case-fold-search nil) ; XML is case-sensitive.
+ ;; Prevent entity definitions from changing the defaults
+ (xml-entity-alist xml-entity-alist)
+ (xml-parameter-entity-alist xml-parameter-entity-alist)
+ xml result dtd)
+ (goto-char (point-min))
+ (while (not (eobp))
+ (if (search-forward "<" nil t)
+ (progn
+ (forward-char -1)
+ (setq result (xml-parse-tag-1 parse-dtd parse-ns))
+ (cond
+ ((null result)
+ ;; Not looking at an xml start tag.
+ (unless (eobp)
+ (forward-char 1)))
+ ((and xml (not xml-sub-parser))
+ ;; Translation of rule [1] of XML specifications
+ (error "XML: (Not Well-Formed) Only one root tag allowed"))
+ ((and (listp (car result))
+ parse-dtd)
+ (setq dtd (car result))
+ (if (cdr result) ; possible leading comment
+ (add-to-list 'xml (cdr result))))
+ (t
+ (add-to-list 'xml result))))
+ (goto-char (point-max))))
+ (if parse-dtd
+ (cons dtd (nreverse xml))
+ (nreverse xml)))))
(defun xml-maybe-do-ns (name default xml-ns)
"Perform any namespace expansion.
During namespace-aware parsing, any name without a namespace is
put into the namespace identified by DEFAULT. nil is used to
-specify that the name shouldn't be given a namespace."
+specify that the name shouldn't be given a namespace.
+Expanded names will by default be returned as a cons. If you
+would like to get plain symbols instead, provide a cons cell
+
+ (symbol-qnames . ALIST)
+
+in the XML-NS argument."
(if (consp xml-ns)
- (let* ((nsp (string-match ":" name))
+ (let* ((symbol-qnames (eq (car-safe xml-ns) 'symbol-qnames))
+ (nsp (string-match ":" name))
(lname (if nsp (substring name (match-end 0)) name))
(prefix (if nsp (substring name 0 (match-beginning 0)) default))
(special (and (string-equal lname "xmlns") (not prefix)))
;; Setting default to nil will insure that there is not
;; matching cons in xml-ns. In which case we
(ns (or (cdr (assoc (if special "xmlns" prefix)
- xml-ns))
- :)))
- (cons ns (if special "" lname)))
+ (if symbol-qnames (cdr xml-ns) xml-ns)))
+ "")))
+ (if (and symbol-qnames
+ (not (string= prefix "xmlns")))
+ (intern (concat ns lname))
+ (cons ns (if special "" lname))))
(intern name)))
(defun xml-parse-tag (&optional parse-dtd parse-ns)
"Parse the tag at point.
If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
returned as the first element in the list.
-If PARSE-NS is non-nil, then QNAMES are expanded.
-Returns one of:
+If PARSE-NS is non-nil, expand QNAMES; for further details, see
+`xml-parse-region'.
+
+Return one of:
- a list : the matching node
- nil : the point is not looking at a tag.
- a pair : the first element is the DTD, the second is the node."
- (let ((xml-ns (if (consp parse-ns)
- parse-ns
- (if parse-ns
- (list
- ;; Default for empty prefix is no namespace
- (cons "" :)
- ;; "xml" namespace
- (cons "xml" :http://www.w3.org/XML/1998/namespace)
- ;; We need to seed the xmlns namespace
- (cons "xmlns" :http://www.w3.org/2000/xmlns/))))))
+ (let* ((case-fold-search nil)
+ ;; Prevent entity definitions from changing the defaults
+ (xml-entity-alist xml-entity-alist)
+ (xml-parameter-entity-alist xml-parameter-entity-alist)
+ (buf (current-buffer))
+ (pos (point)))
+ (with-temp-buffer
+ (with-syntax-table xml-syntax-table
+ (insert-buffer-substring-no-properties buf pos)
+ (goto-char (point-min))
+ (xml-parse-tag-1 parse-dtd parse-ns)))))
+
+(defun xml-parse-tag-1 (&optional parse-dtd parse-ns)
+ "Like `xml-parse-tag', but possibly modify the buffer while working."
+ (let* ((xml-validating-parser (or parse-dtd xml-validating-parser))
+ (xml-ns
+ (cond ((eq parse-ns 'symbol-qnames)
+ (cons 'symbol-qnames xml-default-ns))
+ ((or (consp (car-safe parse-ns))
+ (and (eq (car-safe parse-ns) 'symbol-qnames)
+ (listp (cdr parse-ns))))
+ parse-ns)
+ (parse-ns
+ xml-default-ns))))
(cond
- ;; Processing instructions (like the <?xml version="1.0"?> tag at the
- ;; beginning of a document).
- ((looking-at "<\\?")
+ ;; Processing instructions, like <?xml version="1.0"?>.
+ ((looking-at-p "<\\?")
(search-forward "?>")
(skip-syntax-forward " ")
- (xml-parse-tag parse-dtd xml-ns))
- ;; Character data (CDATA) sections, in which no tag should be interpreted
+ (xml-parse-tag-1 parse-dtd xml-ns))
+ ;; Character data (CDATA) sections, in which no tag should be interpreted
((looking-at "<!\\[CDATA\\[")
(let ((pos (match-end 0)))
(unless (search-forward "]]>" nil t)
- (error "CDATA section does not end anywhere in the document"))
- (buffer-substring pos (match-beginning 0))))
- ;; DTD for the document
- ((looking-at "<!DOCTYPE")
- (let (dtd)
- (if parse-dtd
- (setq dtd (xml-parse-dtd))
- (xml-skip-dtd))
- (skip-syntax-forward " ")
- (if dtd
- (cons dtd (xml-parse-tag nil xml-ns))
- (xml-parse-tag nil xml-ns))))
- ;; skip comments
- ((looking-at "<!--")
+ (error "XML: (Not Well Formed) CDATA section does not end anywhere in the document"))
+ (concat
+ (buffer-substring-no-properties pos (match-beginning 0))
+ (xml-parse-string))))
+ ;; DTD for the document
+ ((looking-at-p "<!DOCTYPE[ \t\n\r]")
+ (let ((dtd (xml-parse-dtd parse-ns)))
+ (skip-syntax-forward " ")
+ (if xml-validating-parser
+ (cons dtd (xml-parse-tag-1 nil xml-ns))
+ (xml-parse-tag-1 nil xml-ns))))
+ ;; skip comments
+ ((looking-at-p "<!--")
(search-forward "-->")
- nil)
- ;; end tag
- ((looking-at "</")
+ ;; FIXME: This loses the skipped-over spaces.
+ (skip-syntax-forward " ")
+ (unless (eobp)
+ (let ((xml-sub-parser t))
+ (xml-parse-tag-1 parse-dtd xml-ns))))
+ ;; end tag
+ ((looking-at-p "</")
'())
- ;; opening tag
- ((looking-at "<\\([^/>[:space:]]+\\)")
+ ;; opening tag
+ ((looking-at (eval-when-compile (concat "<\\(" xml-name-re "\\)")))
(goto-char (match-end 1))
-
;; Parse this node
- (let* ((node-name (match-string 1))
- ;; Parse the attribute list.
- (attrs (xml-parse-attlist xml-ns))
- children pos)
-
- ;; add the xmlns:* attrs to our cache
- (when (consp xml-ns)
+ (let* ((node-name (match-string-no-properties 1))
+ ;; Parse the attribute list.
+ (attrs (xml-parse-attlist xml-ns))
+ children)
+ ;; add the xmlns:* attrs to our cache
+ (when (consp xml-ns)
(dolist (attr attrs)
(when (and (consp (car attr))
- (eq :http://www.w3.org/2000/xmlns/
- (caar attr)))
- (push (cons (cdar attr) (intern (concat ":" (cdr attr))))
- xml-ns))))
-
- (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
-
- ;; is this an empty element ?
- (if (looking-at "/>")
- (progn
+ (equal "http://www.w3.org/2000/xmlns/"
+ (caar attr)))
+ (push (cons (cdar attr) (cdr attr))
+ (if (symbolp (car xml-ns))
+ (cdr xml-ns)
+ xml-ns)))))
+ (setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
+ (cond
+ ;; is this an empty element ?
+ ((looking-at-p "/>")
(forward-char 2)
(nreverse children))
+ ;; is this a valid start tag ?
+ ((eq (char-after) ?>)
+ (forward-char 1)
+ ;; Now check that we have the right end-tag.
+ (let ((end (concat "</" node-name "\\s-*>")))
+ (while (not (looking-at end))
+ (cond
+ ((eobp)
+ (error "XML: (Not Well-Formed) End of document while reading element `%s'"
+ node-name))
+ ((looking-at-p "</")
+ (forward-char 2)
+ (error "XML: (Not Well-Formed) Invalid end tag `%s' (expecting `%s')"
+ (let ((pos (point)))
+ (buffer-substring pos (if (re-search-forward "\\s-*>" nil t)
+ (match-beginning 0)
+ (point-max))))
+ node-name))
+ ;; Read a sub-element and push it onto CHILDREN.
+ ((= (char-after) ?<)
+ (let ((tag (xml-parse-tag-1 nil xml-ns)))
+ (when tag
+ (push tag children))))
+ ;; Read some character data.
+ (t
+ (let ((expansion (xml-parse-string)))
+ (push (if (stringp (car children))
+ ;; If two strings were separated by a
+ ;; comment, concat them.
+ (concat (pop children) expansion)
+ expansion)
+ children)))))
+ ;; Move point past the end-tag.
+ (goto-char (match-end 0))
+ (nreverse children)))
+ ;; Otherwise this was an invalid start tag (expected ">" not found.)
+ (t
+ (error "XML: (Well-Formed) Couldn't parse tag: %s"
+ (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
- ;; is this a valid start tag ?
- (if (eq (char-after) ?>)
- (progn
- (forward-char 1)
- ;; Now check that we have the right end-tag. Note that this
- ;; one might contain spaces after the tag name
- (let ((end (concat "</" node-name "\\s-*>")))
- (while (not (looking-at end))
- (cond
- ((looking-at "</")
- (error "XML: Invalid end tag (expecting %s) at pos %d"
- node-name (point)))
- ((= (char-after) ?<)
- (let ((tag (xml-parse-tag nil xml-ns)))
- (when tag
- (push tag children))))
- (t
- (setq pos (point))
- (search-forward "<")
- (forward-char -1)
- (let ((string (buffer-substring pos (point)))
- (pos 0))
-
- ;; Clean up the string. As per XML
- ;; specifications, the XML processor should
- ;; always pass the whole string to the
- ;; application. But \r's should be replaced:
- ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
- (while (string-match "\r\n?" string pos)
- (setq string (replace-match "\n" t t string))
- (setq pos (1+ (match-beginning 0))))
-
- (setq string (xml-substitute-special string))
- (setq children
- (if (stringp (car children))
- ;; The two strings were separated by a comment.
- (cons (concat (car children) string)
- (cdr children))
- (cons string children))))))))
-
- (goto-char (match-end 0))
- (nreverse children))
- ;; This was an invalid start tag
- (error "XML: Invalid attribute list")))))
- (t ;; This is not a tag.
- (error "XML: Invalid character")))))
+ ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
+ (t
+ (unless xml-sub-parser ; Usually, we error out.
+ (error "XML: (Well-Formed) Invalid character"))
+ ;; However, if we're parsing incrementally, then we need to deal
+ ;; with stray CDATA.
+ (let ((s (xml-parse-string)))
+ (when (zerop (length s))
+ ;; We haven't consumed any input! We must throw an error in
+ ;; order to prevent looping forever.
+ (error "XML: (Not Well-Formed) Could not parse: %s"
+ (buffer-substring-no-properties
+ (point) (min (+ (point) 10) (point-max)))))
+ s)))))
+
+(defun xml-parse-string ()
+ "Parse character data at point, and return it as a string.
+Leave point at the start of the next thing to parse. This
+function can modify the buffer by expanding entity and character
+references."
+ (let ((start (point))
+ ;; Keep track of the size of the rest of the buffer:
+ (old-remaining-size (- (buffer-size) (point)))
+ ref val)
+ (while (and (not (eobp))
+ (not (looking-at-p "<")))
+ ;; Find the next < or & character.
+ (skip-chars-forward "^<&")
+ (when (eq (char-after) ?&)
+ ;; If we find an entity or character reference, expand it.
+ (unless (looking-at xml-entity-or-char-ref-re)
+ (error "XML: (Not Well-Formed) Invalid entity reference"))
+ ;; For a character reference, the next entity or character
+ ;; reference must be after the replacement. [4.6] "Numerical
+ ;; character references are expanded immediately when
+ ;; recognized and MUST be treated as character data."
+ (if (setq ref (match-string 2))
+ (progn ; Numeric char reference
+ (setq val (save-match-data
+ (decode-char 'ucs (string-to-number
+ ref (if (match-string 1) 16)))))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Invalid character reference `%s'"
+ (match-string 0)))
+ (replace-match (if val (string val) xml-undefined-entity) t t))
+ ;; For an entity reference, search again from the start of
+ ;; the replaced text, since the replacement can contain
+ ;; entity or character references, or markup.
+ (setq ref (match-string 3)
+ val (assoc ref xml-entity-alist))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Undefined entity `%s'" ref))
+ (replace-match (or (cdr val) xml-undefined-entity) t t)
+ (goto-char (match-beginning 0)))
+ ;; Check for XML bombs.
+ (and xml-entity-expansion-limit
+ (> (- (buffer-size) (point))
+ (+ old-remaining-size xml-entity-expansion-limit))
+ (error "XML: Entity reference expansion \
+surpassed `xml-entity-expansion-limit'"))))
+ ;; [2.11] Clean up line breaks.
+ (let ((end-marker (point-marker)))
+ (goto-char start)
+ (while (re-search-forward "\r\n?" end-marker t)
+ (replace-match "\n" t t))
+ (goto-char end-marker)
+ (buffer-substring start (point)))))
(defun xml-parse-attlist (&optional xml-ns)
"Return the attribute-list after point.
end-pos name)
(skip-syntax-forward " ")
(while (looking-at (eval-when-compile
- (concat "\\(" xml-name-regexp "\\)\\s-*=\\s-*")))
+ (concat "\\(" xml-name-re "\\)\\s-*=\\s-*")))
(setq end-pos (match-end 0))
- (setq name (xml-maybe-do-ns (match-string 1) nil xml-ns))
+ (setq name (xml-maybe-do-ns (match-string-no-properties 1) nil xml-ns))
(goto-char end-pos)
;; See also: http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize
(setq end-pos (match-end 0))
(if (looking-at "'\\([^']*\\)'")
(setq end-pos (match-end 0))
- (error "XML: Attribute values must be given between quotes")))
+ (error "XML: (Not Well-Formed) Attribute values must be given between quotes")))
;; Each attribute must be unique within a given element
(if (assoc name attlist)
- (error "XML: each attribute must be unique within an element"))
+ (error "XML: (Not Well-Formed) Each attribute must be unique within an element"))
;; Multiple whitespace characters should be replaced with a single one
;; in the attributes
- (let ((string (match-string 1))
- (pos 0))
+ (let ((string (match-string-no-properties 1)))
(replace-regexp-in-string "\\s-\\{2,\\}" " " string)
- (push (cons name (xml-substitute-special string)) attlist))
+ (let ((expansion (xml-substitute-special string)))
+ (unless (stringp expansion)
+ ;; We say this is the constraint. It is actually that
+ ;; neither external entities nor "<" can be in an
+ ;; attribute value.
+ (error "XML: (Not Well-Formed) Entities in attributes cannot expand into elements"))
+ (push (cons name expansion) attlist)))
(goto-char end-pos)
(skip-syntax-forward " "))
(nreverse attlist)))
-;;*******************************************************************
-;;**
-;;** The DTD (document type declaration)
-;;** The following functions know how to skip or parse the DTD of
-;;** a document
-;;**
-;;*******************************************************************
+;;; DTD (document type declaration)
-;; Fixme: This fails at least if the DTD contains conditional sections.
+;; The following functions know how to skip or parse the DTD of a
+;; document. FIXME: it fails at least if the DTD contains conditional
+;; sections.
(defun xml-skip-dtd ()
"Skip the DTD at point.
This follows the rule [28] in the XML specifications."
- (forward-char (length "<!DOCTYPE"))
- (if (looking-at "\\s-*>")
- (error "XML: invalid DTD (excepting name of the document)"))
- (condition-case nil
- (progn
- (forward-sexp)
- (skip-syntax-forward " ")
- (if (looking-at "\\[")
- (re-search-forward "]\\s-*>")
- (search-forward ">")))
- (error (error "XML: No end to the DTD"))))
+ (let ((xml-validating-parser nil))
+ (xml-parse-dtd)))
-(defun xml-parse-dtd ()
+(defun xml-parse-dtd (&optional _parse-ns)
"Parse the DTD at point."
(forward-char (eval-when-compile (length "<!DOCTYPE")))
(skip-syntax-forward " ")
- (if (looking-at ">")
- (error "XML: invalid DTD (excepting name of the document)"))
+ (if (and (looking-at-p ">")
+ xml-validating-parser)
+ (error "XML: (Validity) Invalid DTD (expecting name of the document)"))
;; Get the name of the document
- (looking-at xml-name-regexp)
- (let ((dtd (list (match-string 0) 'dtd))
- type element end-pos)
+ (looking-at xml-name-re)
+ (let ((dtd (list (match-string-no-properties 0) 'dtd))
+ (xml-parameter-entity-alist xml-parameter-entity-alist)
+ next-parameter-entity)
(goto-char (match-end 0))
-
(skip-syntax-forward " ")
- ;; XML [75]
+
+ ;; External subset (XML [75])
(cond ((looking-at "PUBLIC\\s-+")
(goto-char (match-end 0))
(unless (or (re-search-forward
(re-search-forward
"\\='\\([[:space:][:alnum:]-()+,./:=?;!*#@$_%]*\\)'"
nil t))
- (error "XML: missing public id"))
- (let ((pubid (match-string 1)))
+ (error "XML: Missing Public ID"))
+ (let ((pubid (match-string-no-properties 1)))
+ (skip-syntax-forward " ")
(unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
(re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
- (error "XML: missing system id"))
- (push (list pubid (match-string 1) 'public) dtd)))
+ (error "XML: Missing System ID"))
+ (push (list pubid (match-string-no-properties 1) 'public) dtd)))
((looking-at "SYSTEM\\s-+")
(goto-char (match-end 0))
(unless (or (re-search-forward "\\='\\([^']*\\)'" nil t)
(re-search-forward "\\=\"\\([^\"]*\\)\"" nil t))
- (error "XML: missing system id"))
- (push (list (match-string 1) 'system) dtd)))
+ (error "XML: Missing System ID"))
+ (push (list (match-string-no-properties 1) 'system) dtd)))
(skip-syntax-forward " ")
- (if (eq ?> (char-after))
- (forward-char)
- (skip-syntax-forward " ")
- (if (not (eq (char-after) ?\[))
- (error "XML: bad DTD")
+
+ (if (eq (char-after) ?>)
+
+ ;; No internal subset
(forward-char)
- ;; Parse the rest of the DTD
- ;; Fixme: Deal with ENTITY, ATTLIST, NOTATION, PIs.
- (while (not (looking-at "\\s-*\\]"))
- (skip-syntax-forward " ")
- (cond
-
- ;; Translation of rule [45] of XML specifications
- ((looking-at
- "<!ELEMENT\\s-+\\([[:alnum:].%;]+\\)\\s-+\\([^>]+\\)>")
-
- (setq element (match-string 1)
- type (match-string-no-properties 2))
- (setq end-pos (match-end 0))
- ;; Translation of rule [46] of XML specifications
+ ;; Internal subset (XML [28b])
+ (unless (eq (char-after) ?\[)
+ (error "XML: Bad DTD"))
+ (forward-char)
+
+ ;; [2.8]: "markup declarations may be made up in whole or in
+ ;; part of the replacement text of parameter entities."
+
+ ;; Since parameter entities are valid only within the DTD, we
+ ;; first search for the position of the next possible parameter
+ ;; entity. Then, search for the next DTD element; if it ends
+ ;; before the next parameter entity, expand the parameter entity
+ ;; and try again.
+ (setq next-parameter-entity
+ (save-excursion
+ (if (re-search-forward xml-pe-reference-re nil t)
+ (match-beginning 0))))
+
+ ;; Parse the rest of the DTD
+ ;; Fixme: Deal with NOTATION, PIs.
+ (while (not (looking-at-p "\\s-*\\]"))
+ (skip-syntax-forward " ")
+ (cond
+ ((eobp)
+ (error "XML: (Well-Formed) End of document while reading DTD"))
+ ;; Element declaration [45]:
+ ((and (looking-at (eval-when-compile
+ (concat "<!ELEMENT\\s-+\\(" xml-name-re
+ "\\)\\s-+\\([^>]+\\)>")))
+ (or (null next-parameter-entity)
+ (<= (match-end 0) next-parameter-entity)))
+ (let ((element (match-string-no-properties 1))
+ (type (match-string-no-properties 2))
+ (end-pos (match-end 0)))
+ ;; Translation of rule [46] of XML specifications
(cond
- ((string-match "^EMPTY[ \t\n\r]*$" type) ;; empty declaration
+ ((string-match-p "\\`EMPTY\\s-*\\'" type) ; empty declaration
(setq type 'empty))
- ((string-match "^ANY[ \t\n\r]*$" type) ;; any type of contents
+ ((string-match-p "\\`ANY\\s-*$" type) ; any type of contents
(setq type 'any))
- ((string-match "^(\\(.*\\))[ \t\n\r]*$" type) ;; children ([47])
- (setq type (xml-parse-elem-type (match-string 1 type))))
- ((string-match "^%[^;]+;[ \t\n\r]*$" type) ;; substitution
+ ((string-match "\\`(\\(.*\\))\\s-*\\'" type) ; children ([47])
+ (setq type (xml-parse-elem-type
+ (match-string-no-properties 1 type))))
+ ((string-match-p "^%[^;]+;[ \t\n\r]*\\'" type) ; substitution
nil)
- (t
- (error "XML: Invalid element type in the DTD")))
+ (xml-validating-parser
+ (error "XML: (Validity) Invalid element type in the DTD")))
- ;; rule [45]: the element declaration must be unique
- (if (assoc element dtd)
- (error "XML: element declarations must be unique in a DTD (<%s>)"
- element))
+ ;; rule [45]: the element declaration must be unique
+ (and (assoc element dtd)
+ xml-validating-parser
+ (error "XML: (Validity) DTD element declarations must be unique (<%s>)"
+ element))
;; Store the element in the DTD
(push (list element type) dtd)
- (goto-char end-pos))
- ((looking-at "<!--")
- (search-forward "-->"))
-
- (t
- (error "XML: Invalid DTD item")))
+ (goto-char end-pos)))
+
+ ;; Attribute-list declaration [52] (currently unsupported):
+ ((and (looking-at (eval-when-compile
+ (concat "<!ATTLIST[ \t\n\r]*\\(" xml-name-re
+ "\\)[ \t\n\r]*\\(" xml-att-def-re
+ "\\)*[ \t\n\r]*>")))
+ (or (null next-parameter-entity)
+ (<= (match-end 0) next-parameter-entity)))
+ (goto-char (match-end 0)))
+
+ ;; Comments (skip to end, ignoring parameter entity):
+ ((looking-at-p "<!--")
+ (search-forward "-->")
+ (and next-parameter-entity
+ (> (point) next-parameter-entity)
+ (setq next-parameter-entity
+ (save-excursion
+ (if (re-search-forward xml-pe-reference-re nil t)
+ (match-beginning 0))))))
+
+ ;; Internal entity declarations:
+ ((and (looking-at (eval-when-compile
+ (concat "<!ENTITY[ \t\n\r]+\\(%[ \t\n\r]+\\)?\\("
+ xml-name-re "\\)[ \t\n\r]*\\("
+ xml-entity-value-re "\\)[ \t\n\r]*>")))
+ (or (null next-parameter-entity)
+ (<= (match-end 0) next-parameter-entity)))
+ (let* ((name (prog1 (match-string-no-properties 2)
+ (goto-char (match-end 0))))
+ (alist (if (match-string 1)
+ 'xml-parameter-entity-alist
+ 'xml-entity-alist))
+ ;; Retrieve the deplacement text:
+ (value (xml--entity-replacement-text
+ ;; Entity value, sans quotation marks:
+ (substring (match-string-no-properties 3) 1 -1))))
+ ;; If the same entity is declared more than once, the
+ ;; first declaration is binding.
+ (unless (assoc name (symbol-value alist))
+ (set alist (cons (cons name value) (symbol-value alist))))))
+
+ ;; External entity declarations (currently unsupported):
+ ((and (or (looking-at (eval-when-compile
+ (concat "<!ENTITY[ \t\n\r]+\\(%[ \t\n\r]+\\)?\\("
+ xml-name-re "\\)[ \t\n\r]+SYSTEM[ \t\n\r]+"
+ "\\(\"[^\"]*\"\\|'[^']*'\\)[ \t\n\r]*>")))
+ (looking-at (eval-when-compile
+ (concat "<!ENTITY[ \t\n\r]+\\(%[ \t\n\r]+\\)?\\("
+ xml-name-re "\\)[ \t\n\r]+PUBLIC[ \t\n\r]+"
+ "\"[- \r\na-zA-Z0-9'()+,./:=?;!*#@$_%]*\""
+ "\\|'[- \r\na-zA-Z0-9()+,./:=?;!*#@$_%]*'"
+ "[ \t\n\r]+\\(\"[^\"]*\"\\|'[^']*'\\)"
+ "[ \t\n\r]*>"))))
+ (or (null next-parameter-entity)
+ (<= (match-end 0) next-parameter-entity)))
+ (goto-char (match-end 0)))
+
+ ;; If a parameter entity is in the way, expand it.
+ (next-parameter-entity
+ (save-excursion
+ (goto-char next-parameter-entity)
+ (unless (looking-at xml-pe-reference-re)
+ (error "XML: Internal error"))
+ (let* ((entity (match-string 1))
+ (elt (assoc entity xml-parameter-entity-alist)))
+ (if elt
+ (progn
+ (replace-match (cdr elt) t t)
+ ;; The replacement can itself be a parameter entity.
+ (goto-char next-parameter-entity))
+ (goto-char (match-end 0))))
+ (setq next-parameter-entity
+ (if (re-search-forward xml-pe-reference-re nil t)
+ (match-beginning 0)))))
+
+ ;; Anything else is garbage (ignored if not validating).
+ (xml-validating-parser
+ (error "XML: (Validity) Invalid DTD item"))
+ (t
+ (skip-chars-forward "^]"))))
- ;; Skip the end of the DTD
- (search-forward ">"))))
+ (if (looking-at "\\s-*]>")
+ (goto-char (match-end 0))))
(nreverse dtd)))
+(defun xml--entity-replacement-text (string)
+ "Return the replacement text for the entity value STRING.
+The replacement text is obtained by replacing character
+references and parameter-entity references."
+ (let ((ref-re (eval-when-compile
+ (concat "\\(?:&#\\([0-9]+\\)\\|&#x\\([0-9a-fA-F]+\\)\\|%\\("
+ xml-name-re "\\)\\);")))
+ children)
+ (while (string-match ref-re string)
+ (push (substring string 0 (match-beginning 0)) children)
+ (let ((remainder (substring string (match-end 0)))
+ ref val)
+ (cond ((setq ref (match-string 1 string))
+ ;; Decimal character reference
+ (setq val (decode-char 'ucs (string-to-number ref)))
+ (if val (push (string val) children)))
+ ;; Hexadecimal character reference
+ ((setq ref (match-string 2 string))
+ (setq val (decode-char 'ucs (string-to-number ref 16)))
+ (if val (push (string val) children)))
+ ;; Parameter entity reference
+ ((setq ref (match-string 3 string))
+ (setq val (assoc ref xml-parameter-entity-alist))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Undefined parameter entity `%s'" ref))
+ (push (or (cdr val) xml-undefined-entity) children)))
+ (setq string remainder)))
+ (mapconcat 'identity (nreverse (cons string children)) "")))
+
(defun xml-parse-elem-type (string)
"Convert element type STRING into a Lisp structure."
(let (elem modifier)
(if (string-match "(\\([^)]+\\))\\([+*?]?\\)" string)
(progn
- (setq elem (match-string 1 string)
- modifier (match-string 2 string))
- (if (string-match "|" elem)
+ (setq elem (match-string-no-properties 1 string)
+ modifier (match-string-no-properties 2 string))
+ (if (string-match-p "|" elem)
(setq elem (cons 'choice
(mapcar 'xml-parse-elem-type
(split-string elem "|"))))
- (if (string-match "," elem)
+ (if (string-match-p "," elem)
(setq elem (cons 'seq
(mapcar 'xml-parse-elem-type
(split-string elem ",")))))))
(if (string-match "[ \t\n\r]*\\([^+*?]+\\)\\([+*?]?\\)" string)
- (setq elem (match-string 1 string)
- modifier (match-string 2 string))))
+ (setq elem (match-string-no-properties 1 string)
+ modifier (match-string-no-properties 2 string))))
(if (and (stringp elem) (string= elem "#PCDATA"))
(setq elem 'pcdata))
(t
elem))))
-;;*******************************************************************
-;;**
-;;** Substituting special XML sequences
-;;**
-;;*******************************************************************
-
-(eval-when-compile
- (defvar str)) ; dynamic from replace-regexp-in-string
-
-;; Fixme: Take declared entities from the DTD when they're available.
-(defun xml-substitute-entity (match)
- "Subroutine of `xml-substitute-special'."
- (save-match-data
- (let ((match1 (match-string 1 str)))
- (cond ((string= match1 "lt") "<")
- ((string= match1 "gt") ">")
- ((string= match1 "apos") "'")
- ((string= match1 "quot") "\"")
- ((string= match1 "amp") "&")
- ((and (string-match "#\\([0-9]+\\)" match1)
- (let ((c (decode-char
- 'ucs
- (string-to-number (match-string 1 match1)))))
- (if c (string c))))) ; else unrepresentable
- ((and (string-match "#x\\([[:xdigit:]]+\\)" match1)
- (let ((c (decode-char
- 'ucs
- (string-to-number (match-string 1 match1) 16))))
- (if c (string c)))))
- ;; Default to asis. Arguably, unrepresentable code points
- ;; might be best replaced with U+FFFD.
- (t match)))))
+;;; Substituting special XML sequences
(defun xml-substitute-special (string)
- "Return STRING, after subsituting entity references."
- ;; This originally made repeated passes through the string from the
- ;; beginning, which isn't correct, since then either "&amp;" or
- ;; "&amp;" won't DTRT.
- (replace-regexp-in-string "&\\([^;]+\\);"
- #'xml-substitute-entity string t t))
-
-;;*******************************************************************
-;;**
-;;** Printing a tree.
-;;** This function is intended mainly for debugging purposes.
-;;**
-;;*******************************************************************
+ "Return STRING, after substituting entity and character references.
+STRING is assumed to occur in an XML attribute value."
+ (let ((strlen (length string))
+ children)
+ (while (string-match xml-entity-or-char-ref-re string)
+ (push (substring string 0 (match-beginning 0)) children)
+ (let* ((remainder (substring string (match-end 0)))
+ (is-hex (match-string 1 string)) ; Is it a hex numeric reference?
+ (ref (match-string 2 string))) ; Numeric part of reference
+ (if ref
+ ;; [4.6] Character references are included as
+ ;; character data.
+ (let ((val (decode-char 'ucs (string-to-number ref (if is-hex 16)))))
+ (push (cond (val (string val))
+ (xml-validating-parser
+ (error "XML: (Validity) Undefined character `x%s'" ref))
+ (t xml-undefined-entity))
+ children)
+ (setq string remainder
+ strlen (length string)))
+ ;; [4.4.5] Entity references are "included in literal".
+ ;; Note that we don't need do anything special to treat
+ ;; quotes as normal data characters.
+ (setq ref (match-string 3 string)) ; entity name
+ (let ((val (or (cdr (assoc ref xml-entity-alist))
+ (if xml-validating-parser
+ (error "XML: (Validity) Undefined entity `%s'" ref)
+ xml-undefined-entity))))
+ (setq string (concat val remainder)))
+ (and xml-entity-expansion-limit
+ (> (length string) (+ strlen xml-entity-expansion-limit))
+ (error "XML: Passed `xml-entity-expansion-limit' while expanding `&%s;'"
+ ref)))))
+ (mapconcat 'identity (nreverse (cons string children)) "")))
+
+(defun xml-substitute-numeric-entities (string)
+ "Substitute SGML numeric entities by their respective utf characters.
+This function replaces numeric entities in the input STRING and
+returns the modified string. For example \"*\" gets replaced
+by \"*\"."
+ (if (and string (stringp string))
+ (let ((start 0))
+ (while (string-match "&#\\([0-9]+\\);" string start)
+ (ignore-errors
+ (setq string (replace-match
+ (string (read (substring string
+ (match-beginning 1)
+ (match-end 1))))
+ nil nil string)))
+ (setq start (1+ (match-beginning 0))))
+ string)
+ nil))
+
+;;; Printing a parse tree (mainly for debugging).
(defun xml-debug-print (xml &optional indent-string)
"Outputs the XML in the current buffer.
(defalias 'xml-print 'xml-debug-print)
+(defun xml-escape-string (string)
+ "Convert STRING into a string containing valid XML character data.
+Replace occurrences of &<>\\='\" in STRING with their default XML
+entity references (e.g., replace each & with &).
+
+XML character data must not contain & or < characters, nor the >
+character under some circumstances. The XML spec does not impose
+restriction on \" or \\=', but we just substitute for these too
+\(as is permitted by the spec)."
+ (with-temp-buffer
+ (insert string)
+ (dolist (substitution '(("&" . "&")
+ ("<" . "<")
+ (">" . ">")
+ ("'" . "'")
+ ("\"" . """)))
+ (goto-char (point-min))
+ (while (search-forward (car substitution) nil t)
+ (replace-match (cdr substitution) t t nil)))
+ (buffer-string)))
+
(defun xml-debug-print-internal (xml indent-string)
"Outputs the XML tree in the current buffer.
The first line is indented with INDENT-STRING."
;; output the attribute list
(setq attlist (xml-node-attributes tree))
(while attlist
- (insert ?\ (symbol-name (caar attlist)) "=\"" (cdar attlist) ?\")
+ (insert ?\ (symbol-name (caar attlist)) "=\""
+ (xml-escape-string (cdar attlist)) ?\")
(setq attlist (cdr attlist)))
(setq tree (xml-node-children tree))
((listp node)
(insert ?\n)
(xml-debug-print-internal node (concat indent-string " ")))
- ((stringp node) (insert node))
+ ((stringp node)
+ (insert (xml-escape-string node)))
(t
(error "Invalid XML tree"))))
(provide 'xml)
-;; arch-tag: 5864b283-5a68-4b59-a20d-36a72b353b9b
;;; xml.el ends here