xml-query.el (8251B)
1 ;;; xml-query.el --- query engine complimenting the xml package 2 3 ;; This is free and unencumbered software released into the public domain. 4 5 ;;; Commentary: 6 7 ;; This provides a very rudimentary, jQuery-like, XML selector 8 ;; s-expression language. It operates on the output of the xml 9 ;; package, such as `xml-parse-region' and `xml-parse-file'. It was 10 ;; written to support Elfeed. 11 12 ;; See the docstring for `xml-query-all'. 13 14 ;; The macro forms, `xml-query*' and `xml-query-all*', are an order of 15 ;; magnitude faster, but only work on static selectors and need the 16 ;; namespaces to be pre-stripped. 17 18 ;; Examples: 19 20 ;; This query grabs the top-level paragraph content from XHTML. 21 22 ;; (xml-query-all '(html body p *) xhtml) 23 24 ;; This query extracts all the links from an Atom feed. 25 26 ;; (xml-query-all '(feed entry link [rel "alternate"] :href) xml) 27 28 ;;; Code: 29 30 (require 'cl-lib) 31 32 (defun xml-query-strip-ns (tag) 33 "Remove the namespace, if any, from TAG." 34 (when (symbolp tag) 35 (let ((name (symbol-name tag))) 36 (if (cl-find ?\: name) 37 (intern (replace-regexp-in-string "^.+:" "" name)) 38 tag)))) 39 40 (defun xml-query--tag-all (match xml) 41 (cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml) 42 when (or (eq tag match) (eq (xml-query-strip-ns tag) match)) 43 collect (cons tag (cons attribs content)))) 44 45 (defun xml-query--attrib-all (attrib value xml) 46 (cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml) 47 when (equal (cdr (assoc attrib attribs)) value) 48 collect (cons tag (cons attribs content)))) 49 50 (defun xml-query--keyword (matcher xml) 51 (cl-loop with match = (intern (substring (symbol-name matcher) 1)) 52 for (tag attribs . content) in (cl-remove-if-not #'listp xml) 53 when (cdr (assoc match attribs)) 54 collect it)) 55 56 (defun xml-query--symbol (matcher xml) 57 (xml-query--tag-all matcher xml)) 58 59 (defun xml-query--vector (matcher xml) 60 (let ((attrib (aref matcher 0)) 61 (value (aref matcher 1))) 62 (xml-query--attrib-all attrib value xml))) 63 64 (defun xml-query--list (matchers xml) 65 (cl-loop for matcher in matchers 66 append (xml-query-all (if (listp matcher) 67 matcher 68 (list matcher)) xml))) 69 70 (defun xml-query--append (xml) 71 (cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml) 72 append content)) 73 74 (defun xml-query--stringp (thing) 75 "Return non-nil of THING is a non-blank string." 76 (and (stringp thing) (string-match "[^ \t\r\n]" thing))) 77 78 (defun xml-query-all (query xml) 79 "Given a list of tags, XML, apply QUERY and return a list of 80 matching tags. 81 82 A query is a list of matchers. 83 - SYMBOL: filters to matching tags 84 - LIST: each element is a full sub-query, whose results are concatenated 85 - VECTOR: filters to tags with matching attribute, [tag attrib value] 86 - KEYWORD: filters to an attribute value (must be last) 87 - * (an asterisk symbol): filters to content strings (must be last) 88 89 For example, to find all the 'alternate' link URL in a typical 90 Atom feed: 91 92 (xml-query-all '(feed entry link [rel \"alternate\"] :href) xml)" 93 (if (null query) 94 xml 95 (cl-destructuring-bind (matcher . rest) query 96 (cond 97 ((keywordp matcher) (xml-query--keyword matcher xml)) 98 ((eq matcher '*) 99 (cl-remove-if-not #'xml-query--stringp (xml-query--append xml))) 100 (:else 101 (let ((matches 102 (cl-etypecase matcher 103 (symbol (xml-query--symbol matcher xml)) 104 (vector (xml-query--vector matcher xml)) 105 (list (xml-query--list matcher xml))))) 106 (cond 107 ((null rest) matches) 108 ((and (or (symbolp (car rest)) 109 (listp (car rest))) 110 (not (keywordp (car rest))) 111 (not (eq '* (car rest)))) 112 (xml-query-all (cdr query) (xml-query--append matches))) 113 (:else (xml-query-all rest matches))))))))) 114 115 (defun xml-query (query xml) 116 "Like `xml-query-all' but only return the first result." 117 (let ((result (xml-query-all query xml))) 118 (if (xml-query--stringp result) 119 result 120 (car (xml-query-all query xml))))) 121 122 ;; Macro alternatives: 123 124 ;; This is a slightly less capable alternative with significantly 125 ;; better performance (x10 speedup) that requires a static selector. 126 ;; The selector is compiled into Lisp code via macro at compile-time, 127 ;; which is then carried through to byte-code by the compiler. In 128 ;; byte-code form, the macro performs no function calls other than 129 ;; `throw' in the case of `xml-query*', where it's invoked less than 130 ;; once per evaluation (only on success). 131 132 ;; Queries are compiled tail-to-head with a result handler at the 133 ;; deepest level. The generated code makes multiple bindings of the 134 ;; variable "v" as it dives deeper into the query, using the layers of 135 ;; bindings as a breadcrumb stack. 136 137 ;; For `xml-query*', which has a single result, the whole expression 138 ;; is wrapped in a catch, and the first successful match is thrown to 139 ;; it from the result handler. 140 141 ;; For `xml-query-all*', the result is pushed into an output list. 142 143 (defun xml-query--compile-tag (tag subexp subloop-p) 144 `(when (and (consp v) (eq (car v) ',tag)) 145 ,(if subloop-p 146 `(dolist (v (cddr v)) 147 ,subexp) 148 subexp))) 149 150 (defun xml-query--compile-attrib (pair subexp subloop-p) 151 `(let ((value (cdr (assq ',(aref pair 0) (cadr v))))) 152 (when (equal value ,(aref pair 1)) 153 ,(if subloop-p 154 `(dolist (v (cddr v)) 155 ,subexp) 156 subexp)))) 157 158 (defun xml-query--compile-keyword (keyword subexp) 159 (let ((attrib (intern (substring (symbol-name keyword) 1)))) 160 `(let ((v (cdr (assq ',attrib (cadr v))))) 161 (when v 162 ,subexp)))) 163 164 (defun xml-query--compile-star (subexp) 165 `(when (and (stringp v) (string-match "[^ \t\r\n]" v)) 166 ,subexp)) 167 168 (defun xml-query--compile-top (query input subexp) 169 (let* ((rquery (reverse query)) 170 (prev nil)) 171 (while rquery 172 (let ((matcher (pop rquery)) 173 ;; Should the next item loop over its children? 174 (subloop-p (and (not (null prev)) 175 (not (keywordp prev)) 176 (symbolp prev)))) 177 (cond 178 ((eq '* matcher) 179 (setf subexp (xml-query--compile-star subexp))) 180 ((keywordp matcher) 181 (setf subexp (xml-query--compile-keyword matcher subexp))) 182 ((symbolp matcher) 183 (setf subexp (xml-query--compile-tag matcher subexp subloop-p))) 184 ((vectorp matcher) 185 (setf subexp (xml-query--compile-attrib matcher subexp subloop-p))) 186 ((error "Bad query: %S" query))) 187 (setf prev matcher))) 188 `(dolist (v ,input) 189 ,subexp))) 190 191 (defun xml-query--compile (query input) 192 (let ((tag (make-symbol "done"))) 193 `(catch ',tag 194 ,(xml-query--compile-top query input `(throw ',tag v))))) 195 196 (defmacro xml-query* (query sexp) 197 "Like `xml-query' but generate code to execute QUERY on SEXP. 198 199 Unlike `xml-query', QUERY must be a static, compile-time 200 s-expression. See `xml-query-all*' for more information. 201 202 QUERY is *not* evaluated, so it should not be quoted." 203 (xml-query--compile query sexp)) 204 205 (defun xml-query-all--compile (query input) 206 (let ((output (make-symbol "output"))) 207 `(let ((,output ())) 208 ,(xml-query--compile-top query input `(push v ,output)) 209 (nreverse ,output)))) 210 211 (defmacro xml-query-all* (query sexp) 212 "Like `xml-query-all' but generate code to execute QUERY on SEXP. 213 214 Unlike `xml-query-all', QUERY must be a static, compile-time 215 s-expression. This macro compiles the query into actual code. The 216 result is faster since the query will be compiled into byte-code 217 rather than \"interpreted\" at run time. 218 219 Also unlike `xml-query-all', the parsed XML s-expression must 220 also have its namespace pre-stripped. This is accomplished by 221 setting the optional PARSE-NS argument of `xml-parse-region' to 222 symbol-qnames. 223 224 Sub-expression lists are not supported by this macro. 225 226 QUERY is *not* evaluated, so it should not be quoted." 227 (xml-query-all--compile query sexp)) 228 229 (provide 'xml-query) 230 231 ;;; xml-query.el ends here