dotemacs

My Emacs configuration
git clone git://git.entf.net/dotemacs
Log | Files | Refs | LICENSE

xml-query.el (8251B)


      1 ;;; xml-query.el --- query engine complimenting the xml package
      2 
      3 ;; This is free and unencumbered software released into the public domain.
      4 
      5 ;;; Commentary:
      6 
      7 ;; This provides a very rudimentary, jQuery-like, XML selector
      8 ;; s-expression language. It operates on the output of the xml
      9 ;; package, such as `xml-parse-region' and `xml-parse-file'. It was
     10 ;; written to support Elfeed.
     11 
     12 ;; See the docstring for `xml-query-all'.
     13 
     14 ;; The macro forms, `xml-query*' and `xml-query-all*', are an order of
     15 ;; magnitude faster, but only work on static selectors and need the
     16 ;; namespaces to be pre-stripped.
     17 
     18 ;; Examples:
     19 
     20 ;; This query grabs the top-level paragraph content from XHTML.
     21 
     22 ;;  (xml-query-all '(html body p *) xhtml)
     23 
     24 ;; This query extracts all the links from an Atom feed.
     25 
     26 ;;  (xml-query-all '(feed entry link [rel "alternate"] :href) xml)
     27 
     28 ;;; Code:
     29 
     30 (require 'cl-lib)
     31 
     32 (defun xml-query-strip-ns (tag)
     33   "Remove the namespace, if any, from TAG."
     34   (when (symbolp tag)
     35     (let ((name (symbol-name tag)))
     36       (if (cl-find ?\: name)
     37           (intern (replace-regexp-in-string "^.+:" "" name))
     38         tag))))
     39 
     40 (defun xml-query--tag-all (match xml)
     41   (cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml)
     42            when (or (eq tag match) (eq (xml-query-strip-ns tag) match))
     43            collect (cons tag (cons attribs content))))
     44 
     45 (defun xml-query--attrib-all (attrib value xml)
     46   (cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml)
     47            when (equal (cdr (assoc attrib attribs)) value)
     48            collect (cons tag (cons attribs content))))
     49 
     50 (defun xml-query--keyword (matcher xml)
     51   (cl-loop with match = (intern (substring (symbol-name matcher) 1))
     52            for (tag attribs . content) in (cl-remove-if-not #'listp xml)
     53            when (cdr (assoc match attribs))
     54            collect it))
     55 
     56 (defun xml-query--symbol (matcher xml)
     57   (xml-query--tag-all matcher xml))
     58 
     59 (defun xml-query--vector (matcher xml)
     60   (let ((attrib (aref matcher 0))
     61         (value (aref matcher 1)))
     62     (xml-query--attrib-all attrib value xml)))
     63 
     64 (defun xml-query--list (matchers xml)
     65   (cl-loop for matcher in matchers
     66            append (xml-query-all (if (listp matcher)
     67                                      matcher
     68                                    (list matcher)) xml)))
     69 
     70 (defun xml-query--append (xml)
     71   (cl-loop for (tag attribs . content) in (cl-remove-if-not #'listp xml)
     72            append content))
     73 
     74 (defun xml-query--stringp (thing)
     75   "Return non-nil of THING is a non-blank string."
     76   (and (stringp thing) (string-match "[^ \t\r\n]" thing)))
     77 
     78 (defun xml-query-all (query xml)
     79   "Given a list of tags, XML, apply QUERY and return a list of
     80 matching tags.
     81 
     82 A query is a list of matchers.
     83  - SYMBOL: filters to matching tags
     84  - LIST: each element is a full sub-query, whose results are concatenated
     85  - VECTOR: filters to tags with matching attribute, [tag attrib value]
     86  - KEYWORD: filters to an attribute value (must be last)
     87  - * (an asterisk symbol): filters to content strings (must be last)
     88 
     89 For example, to find all the 'alternate' link URL in a typical
     90 Atom feed:
     91 
     92   (xml-query-all '(feed entry link [rel \"alternate\"] :href) xml)"
     93   (if (null query)
     94       xml
     95     (cl-destructuring-bind (matcher . rest) query
     96       (cond
     97        ((keywordp matcher) (xml-query--keyword matcher xml))
     98        ((eq matcher '*)
     99         (cl-remove-if-not #'xml-query--stringp (xml-query--append xml)))
    100        (:else
    101         (let ((matches
    102                (cl-etypecase matcher
    103                  (symbol (xml-query--symbol matcher xml))
    104                  (vector (xml-query--vector matcher xml))
    105                  (list   (xml-query--list   matcher xml)))))
    106           (cond
    107            ((null rest) matches)
    108            ((and (or (symbolp (car rest))
    109                      (listp (car rest)))
    110                  (not (keywordp (car rest)))
    111                  (not (eq '* (car rest))))
    112             (xml-query-all (cdr query) (xml-query--append matches)))
    113            (:else (xml-query-all rest matches)))))))))
    114 
    115 (defun xml-query (query xml)
    116   "Like `xml-query-all' but only return the first result."
    117   (let ((result (xml-query-all query xml)))
    118     (if (xml-query--stringp result)
    119         result
    120       (car (xml-query-all query xml)))))
    121 
    122 ;; Macro alternatives:
    123 
    124 ;; This is a slightly less capable alternative with significantly
    125 ;; better performance (x10 speedup) that requires a static selector.
    126 ;; The selector is compiled into Lisp code via macro at compile-time,
    127 ;; which is then carried through to byte-code by the compiler. In
    128 ;; byte-code form, the macro performs no function calls other than
    129 ;; `throw' in the case of `xml-query*', where it's invoked less than
    130 ;; once per evaluation (only on success).
    131 
    132 ;; Queries are compiled tail-to-head with a result handler at the
    133 ;; deepest level. The generated code makes multiple bindings of the
    134 ;; variable "v" as it dives deeper into the query, using the layers of
    135 ;; bindings as a breadcrumb stack.
    136 
    137 ;; For `xml-query*', which has a single result, the whole expression
    138 ;; is wrapped in a catch, and the first successful match is thrown to
    139 ;; it from the result handler.
    140 
    141 ;; For `xml-query-all*', the result is pushed into an output list.
    142 
    143 (defun xml-query--compile-tag (tag subexp subloop-p)
    144   `(when (and (consp v) (eq (car v) ',tag))
    145      ,(if subloop-p
    146           `(dolist (v (cddr v))
    147              ,subexp)
    148         subexp)))
    149 
    150 (defun xml-query--compile-attrib (pair subexp subloop-p)
    151   `(let ((value (cdr (assq ',(aref pair 0) (cadr v)))))
    152      (when (equal value ,(aref pair 1))
    153        ,(if subloop-p
    154             `(dolist (v (cddr v))
    155                ,subexp)
    156           subexp))))
    157 
    158 (defun xml-query--compile-keyword (keyword subexp)
    159   (let ((attrib (intern (substring (symbol-name keyword) 1))))
    160     `(let ((v (cdr (assq ',attrib (cadr v)))))
    161        (when v
    162          ,subexp))))
    163 
    164 (defun xml-query--compile-star (subexp)
    165   `(when (and (stringp v) (string-match "[^ \t\r\n]" v))
    166      ,subexp))
    167 
    168 (defun xml-query--compile-top (query input subexp)
    169   (let* ((rquery (reverse query))
    170          (prev nil))
    171     (while rquery
    172       (let ((matcher (pop rquery))
    173             ;; Should the next item loop over its children?
    174             (subloop-p (and (not (null prev))
    175                             (not (keywordp prev))
    176                             (symbolp prev))))
    177         (cond
    178          ((eq '* matcher)
    179           (setf subexp (xml-query--compile-star subexp)))
    180          ((keywordp matcher)
    181           (setf subexp (xml-query--compile-keyword matcher subexp)))
    182          ((symbolp matcher)
    183           (setf subexp (xml-query--compile-tag matcher subexp subloop-p)))
    184          ((vectorp matcher)
    185           (setf subexp (xml-query--compile-attrib matcher subexp subloop-p)))
    186          ((error "Bad query: %S" query)))
    187         (setf prev matcher)))
    188     `(dolist (v ,input)
    189        ,subexp)))
    190 
    191 (defun xml-query--compile (query input)
    192   (let ((tag (make-symbol "done")))
    193     `(catch ',tag
    194        ,(xml-query--compile-top query input `(throw ',tag v)))))
    195 
    196 (defmacro xml-query* (query sexp)
    197   "Like `xml-query' but generate code to execute QUERY on SEXP.
    198 
    199 Unlike `xml-query', QUERY must be a static, compile-time
    200 s-expression. See `xml-query-all*' for more information.
    201 
    202 QUERY is *not* evaluated, so it should not be quoted."
    203   (xml-query--compile query sexp))
    204 
    205 (defun xml-query-all--compile (query input)
    206   (let ((output (make-symbol "output")))
    207     `(let ((,output ()))
    208        ,(xml-query--compile-top query input `(push v ,output))
    209        (nreverse ,output))))
    210 
    211 (defmacro xml-query-all* (query sexp)
    212   "Like `xml-query-all' but generate code to execute QUERY on SEXP.
    213 
    214 Unlike `xml-query-all', QUERY must be a static, compile-time
    215 s-expression. This macro compiles the query into actual code. The
    216 result is faster since the query will be compiled into byte-code
    217 rather than \"interpreted\" at run time.
    218 
    219 Also unlike `xml-query-all', the parsed XML s-expression must
    220 also have its namespace pre-stripped. This is accomplished by
    221 setting the optional PARSE-NS argument of `xml-parse-region' to
    222 symbol-qnames.
    223 
    224 Sub-expression lists are not supported by this macro.
    225 
    226 QUERY is *not* evaluated, so it should not be quoted."
    227   (xml-query-all--compile query sexp))
    228 
    229 (provide 'xml-query)
    230 
    231 ;;; xml-query.el ends here