dotemacs

My Emacs configuration
git clone git://git.entf.net/dotemacs
Log | Files | Refs | LICENSE

esxml-query.el (28934B)


      1 ;;; esxml-query.el --- select esxml nodes jQuery-style
      2 
      3 ;; Copyright (C) 2017 Vasilij Schneidermann <mail@vasilij.de>
      4 
      5 ;; Author: Vasilij Schneidermann <mail@vasilij.de>
      6 ;; Maintainer: Vasilij Schneidermann
      7 ;; Version: 0.1.1
      8 ;; Keywords: data, lisp
      9 ;; Package-Requires: ((cl-lib "0.1"))
     10 ;;
     11 ;; This program is free software; you can redistribute it and/or
     12 ;; modify it under the terms of the GNU General Public License as
     13 ;; published by the Free Software Foundation, either version 3 of the
     14 ;; License, or (at your option) any later version.
     15 
     16 ;; This program is distributed in the hope that it will be useful,
     17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     19 ;; GNU General Public License for more details.
     20 
     21 ;; You should have received a copy of the GNU General Public License
     22 ;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
     23 
     24 ;;; Commentary:
     25 
     26 ;; Traditionally people pick one of the following options when faced
     27 ;; with the task of extracting data from XML in Emacs Lisp:
     28 ;;
     29 ;; - Using regular expressions on the unparsed document
     30 ;; - Manual tree traversal with `assoc', `car' and `cdr'
     31 ;;
     32 ;; Browsers faced a similar problem until jQuery happened, shortly
     33 ;; afterwards they started providing the `node.querySelector' and
     34 ;; `node.querySelectorAll' API for retrieving one or all nodes
     35 ;; matching a given CSS selector.  This code implements the same API
     36 ;; with the `esxml-query' and `esxml-query-all' functions.  The
     37 ;; following table summarizes the currently supported modifiers and
     38 ;; combinators:
     39 ;;
     40 ;; | Name                               | Supported? | Syntax      |
     41 ;; |------------------------------------+------------+-------------|
     42 ;; | Namespaces                         | No         | foo|bar     |
     43 ;; | Commas                             | Yes        | foo, bar    |
     44 ;; | Descendant combinator              | Yes        | foo bar     |
     45 ;; | Child combinator                   | Yes        | foo>bar     |
     46 ;; | Adjacent sibling combinator        | No         | foo+bar     |
     47 ;; | General sibling combinator         | No         | foo~bar     |
     48 ;; | Universal selector                 | Yes        | *           |
     49 ;; | Type selector                      | Yes        | tag         |
     50 ;; | ID selector                        | Yes        | #foo        |
     51 ;; | Class selector                     | Yes        | .foo        |
     52 ;; | Attribute selector                 | Yes        | [foo]       |
     53 ;; | Exact match attribute selector     | Yes        | [foo=bar]   |
     54 ;; | Prefix match attribute selector    | Yes        | [foo^=bar]  |
     55 ;; | Suffix match attribute selector    | Yes        | [foo$=bar]  |
     56 ;; | Substring match attribute selector | Yes        | [foo*=bar]  |
     57 ;; | Include match attribute selector   | Yes        | [foo~=bar]  |
     58 ;; | Dash match attribute selector      | Yes        | [foo|=bar]  |
     59 ;; | Attribute selector modifiers       | No         | [foo=bar i] |
     60 ;; | Pseudo elements                    | No         | ::foo       |
     61 ;; | Pseudo classes                     | No         | :foo        |
     62 
     63 ;;; Code:
     64 
     65 (require 'cl-lib)
     66 
     67 
     68 ;;; CSS selector parsing
     69 
     70 ;; https://www.w3.org/TR/selectors/#w3cselgrammar
     71 ;; https://www.w3.org/TR/selectors4/#grammar
     72 ;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#detailed-grammar
     73 ;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#tokenization
     74 
     75 ;; you might be wondering why I'm using both level 3 and 4 standards,
     76 ;; well, the level 3 one has a buggy lexer section whereas level 4
     77 ;; omits crucial parser definitions, so both have to be used...
     78 
     79 ;; TODO: support :not
     80 (defvar esxml--css-selector-token-matchers
     81   (let* ((h "[0-9a-f]")
     82          (nl "\n\\|\r\n\\|\r\\|\f")
     83          (nonascii "[\200-\U0010ffff]")
     84          (unicode (format "\\\\%s\\{1,6\\}[ \t\r\n\f]?" h))
     85          (escape (format "\\(?:%s\\)\\|\\\\[ -~\200-\U0010ffff]" unicode))
     86          (nmstart (format "[a-z_]\\|%s\\|\\(?:%s\\)" nonascii escape))
     87          (nmchar (format "[a-z0-9_-]\\|%s\\|\\(?:%s\\)" nonascii escape))
     88          (num "[0-9]+\\|[0-9]*\\.[0-9]+")
     89          (string1 (format "\"\\(?:[\t !#$%%&(-~]\\|\\\\\\(?:%s\\)\\|'\\|%s\\|\\(?:%s\\)\\)*\"" nl nonascii escape))
     90          (string2 (format "'\\(?:[\t !#$%%&(-~]\\|\\\\\\(?:%s\\)\\|\"\\|%s\\|\\(?:%s\\)\\)*'" nl nonascii escape))
     91          (ident (format "[-]?\\(?:%s\\)\\(?:%s\\)*" nmstart nmchar))
     92          (unit (format "[-]?\\(?:%s\\)\\(?:%s\\)+" nmstart nmchar))
     93          (name (format "\\(?:%s\\)+" nmchar)))
     94 
     95     `((whitespace . "[ \t\r\n\f]+")
     96       (string . ,(format "\\(?:%s\\|%s\\)" string1 string2))
     97       (ident . ,ident)
     98       (hash . ,(format "#%s" name))
     99       (function . ,(format "%s(" ident))
    100       (number . ,num)
    101       (dimension . ,(format "\\(?:%s\\)%s" num unit))
    102       (prefix-match . "\\^=")
    103       (suffix-match . "\\$=")
    104       (substring-match . "\\*=")
    105       (include-match . "~=")
    106       (dash-match . "|=")
    107       (comma . ",")
    108       (gt . ">")
    109       (plus . "\\+")
    110       (minus . "-")
    111       (tilde . "~")
    112       (asterisk . "\\*")
    113       (period . "\\.")
    114       (equals . "=")
    115       (colon . ":")
    116       (lbracket . "\\[")
    117       (rbracket . "\\]")
    118       (rparen . ")"))))
    119 
    120 (defun esxml--tokenize-css-selector (string)
    121   (let (result)
    122     (with-temp-buffer
    123       (insert string)
    124       (goto-char (point-min))
    125       (while (not (eobp))
    126         (let ((max-length 0)
    127               longest)
    128           (dolist (matcher esxml--css-selector-token-matchers)
    129             (let ((id (car matcher))
    130                   (re (cdr matcher)))
    131               (when (looking-at re)
    132                 (let* ((token (match-string 0))
    133                        (length (length token)))
    134                   (when (> length max-length)
    135                     (setq max-length length)
    136                     (setq longest (cons id token)))))))
    137           (when (not longest)
    138             (error "Invalid token detected: %s"
    139                    (buffer-substring (point) (point-max))))
    140           (push longest result)
    141           (goto-char (+ (point) max-length)))))
    142     (nreverse result)))
    143 
    144 ;; the alternative is creating a mutable object with peek/next methods
    145 ;; and passing it around, so I chose the one requiring less typing, a
    146 ;; dynamically bound variable :<
    147 
    148 (defvar esxml--token-stream)
    149 
    150 ;; TODO: support :not
    151 ;; css-selector:
    152 ;;   css-selector-list;
    153 ;; css-selector-list:
    154 ;;   complex-css-selector [ comma whitespace* complex-css-selector ]*;
    155 ;; complex-css-selector:
    156 ;;   compound-css-selector [ css-combinator compound-css-selector ]* whitespace*;
    157 ;; css-combinator:
    158 ;;   whitespace+ | whitespace* [ '>' | '+' | '~' ] whitespace*;
    159 ;; compound-css-selector:
    160 ;;   css-type-selector css-modifier* | css-modifier+;
    161 ;; css-type-selector:
    162 ;;   IDENT | *;
    163 ;; css-modifier:
    164 ;;    css-id | css-class | css-attrib | css-pseudo;
    165 ;; css-id:
    166 ;;   HASH;
    167 ;; css-class:
    168 ;;   '.' IDENT;
    169 ;; css-attrib:
    170 ;;   '[' whitespace* css-attrib-name ']'
    171 ;;   | '[' whitespace* css-attrib-name css-attrib-match css-attrib-value whitespace* ']';
    172 ;; css-attrib-name:
    173 ;;   IDENT whitespace*;
    174 ;; css-attrib-match:
    175 ;;   [ '=' | PREFIX-MATCH | SUFFIX-MATCH | SUBSTRING-MATCH | INCLUDE-MATCH | DASH-MATCH ] whitespace*;
    176 ;; css-attrib-value:
    177 ;;   IDENT | STRING;
    178 ;; css-pseudo:
    179 ;;   ':' ':'? [ IDENT | css-functional-pseudo ];
    180 ;; css-functional-pseudo:
    181 ;;   FUNCTION whitespace* [ css-expression whitespace* ]+ ')';
    182 ;; css-expression:
    183 ;;   '+' | '-' | DIMENSION | NUMBER | STRING | IDENT
    184 
    185 (defun esxml-query-css-escape (string)
    186   "Returns escaped version of STRING for use in selectors.
    187 The logic used here corresponds to the CSS.escape API as
    188 specified in https://drafts.csswg.org/cssom/#the-css.escape()-method."
    189   (let (chars)
    190     (dotimes (i (length string))
    191       (let* ((char (aref string i))
    192              (unprintablep (or (and (>= char ?\u0001) (<= char ?\u001f))
    193                                (= char ?\u007f)))
    194              (nonasciip (>= char ?\u0080))
    195              (digitp (and (>= char ?\u0030) (<= char ?\u0039)))
    196              (upperp (and (>= char ?\u0041) (<= char ?\u005a)))
    197              (lowerp (and (>= char ?\u0061) (<= char ?\u007a))))
    198         (cond
    199          ((= char ?\u0000)
    200           (push ?\ufffd chars))
    201          (unprintablep
    202           (dolist (char (string-to-list (format "\\%x " char)))
    203             (push char chars)))
    204          ((and (= i 0) digitp)
    205           (dolist (char (string-to-list (format "\\%x " char)))
    206             (push char chars)))
    207          ((and (= i 1) digitp (= (aref string 0) ?-))
    208           (dolist (char (string-to-list (format "\\%x " char)))
    209             (push char chars)))
    210          ((and (= i 0) (= char ?-) (= (length string) 1))
    211           (push ?\\ chars)
    212           (push char chars))
    213          ((or nonasciip (= char ?-) (= char ?_) digitp upperp lowerp)
    214           (push char chars))
    215          (t
    216           (push ?\\ chars)
    217           (push char chars)))))
    218     (concat (nreverse chars))))
    219 
    220 (defun esxml--parse-css-identifier (string)
    221   ;; https://www.w3.org/TR/css-syntax-3/#consume-string-token
    222   (let* ((code-points (string-to-list string))
    223          chars
    224          token)
    225     (while code-points
    226       (let ((char (pop code-points)))
    227         (if (= char ?\\)
    228             (let ((char (pop code-points)))
    229               (cond
    230                ((not char))
    231                ((= char ?\n))
    232                ((or (and (>= char ?0) (<= char ?9))
    233                     (and (>= char ?a) (<= char ?f))
    234                     (and (>= char ?A) (<= char ?F)))
    235                 (let ((i 0)
    236                       (hex-chars (list char)))
    237                   (while (and (< i 5) code-points)
    238                     (let ((char (car code-points)))
    239                       (if (or (and (>= char ?0) (<= char ?9))
    240                               (and (>= char ?a) (<= char ?f))
    241                               (and (>= char ?A) (<= char ?F)))
    242                           (push (pop code-points) hex-chars)
    243                         (setq i 5)))
    244                     (setq i (1+ i)))
    245                   (let ((char (car code-points)))
    246                     (when (and char (= char ?\s))
    247                       (pop code-points)))
    248                   (let* ((hex-token (concat (nreverse hex-chars)))
    249                          (code-point (string-to-number hex-token 16)))
    250                     (if (or (zerop code-point)
    251                             (and (>= code-point ?\ud800) (<= code-point ?\udfff))
    252                             (> code-point ?\U0010ffff))
    253                         (push ?\ufffd chars)
    254                       (push code-point chars)))))
    255                (t ; unspecified: non-hex digit
    256                 (push char chars))))
    257           (push char chars))))
    258     (concat (nreverse chars))))
    259 
    260 (defun esxml--parse-css-string-literal (string)
    261   (esxml--parse-css-identifier (substring string 1 -1)))
    262 
    263 (defmacro esxml--with-parse-shorthands (&rest body)
    264   `(cl-macrolet ((peek () '(car esxml--token-stream))
    265                  (next () '(pop esxml--token-stream))
    266                  (accept (type) `(and (peek) (eq (car (peek)) ,type)
    267                                       (cdr (next))))
    268                  (eat-whitespace () '(while (accept 'whitespace))))
    269      ,@body))
    270 (def-edebug-spec esxml--with-parse-shorthands (body))
    271 
    272 (defun esxml-parse-css-selector (string)
    273   "Parse CSS selector STRING into a list of alists.
    274 Each alist represents a complex CSS selector.  The result can be
    275 passed to `esxml-query' and `esxml-query-all' as the selector
    276 argument."
    277   (let* ((esxml--token-stream (esxml--tokenize-css-selector string))
    278          (result (esxml--parse-css-selector-list)))
    279     (when esxml--token-stream
    280       (error "Trailing garbage: %s"
    281              (mapconcat 'cdr esxml--token-stream "")))
    282     result))
    283 
    284 (defun esxml--parse-css-selector-list ()
    285   (esxml--with-parse-shorthands
    286    (let ((first (esxml--parse-complex-css-selector))
    287          result)
    288      (when (not first)
    289        (error "Expected at least one selector"))
    290      (push first result)
    291 
    292      (while (accept 'comma)
    293        (eat-whitespace)
    294        (let ((selector (esxml--parse-complex-css-selector)))
    295          (when (not selector)
    296            (error "Expected selector after comma"))
    297          (push selector result)))
    298      (nreverse result))))
    299 
    300 (defun esxml--parse-complex-css-selector ()
    301   (esxml--with-parse-shorthands
    302    (let ((first (esxml--parse-compound-css-selector))
    303          result done)
    304      (when first
    305        (push first result)
    306 
    307        (while (not done)
    308          (let ((combinator (esxml--parse-css-combinator)))
    309            (if combinator
    310                (let ((compound (esxml--parse-compound-css-selector)))
    311                  (cond
    312                   (compound
    313                    (setq result (append (list compound combinator) result)))
    314                   ;; allow whitespace before comma
    315                   ((not (eq (car (peek)) 'comma))
    316                    (error "Trailing combinator"))))
    317              (setq done t))))
    318        (nreverse result)))))
    319 
    320 (defun esxml--parse-css-combinator ()
    321   (esxml--with-parse-shorthands
    322    ;; NOTE: whitespace-surrounded combinators are distinguished from
    323    ;; whitespace-only ones by checking whether there has been
    324    ;; whitespace followed by a non-blank combinator
    325    (let ((leading-whitespace-p (eq (car (peek)) 'whitespace))
    326          result)
    327      (eat-whitespace)
    328      (let ((type (car (peek))))
    329        (cond
    330         ((member type '(gt plus tilde))
    331          (next)
    332          (cond
    333           ((eq type 'gt)
    334            (setq result '((combinator . child))))
    335           ((eq type 'plus)
    336            (setq result '((combinator . direct-sibling))))
    337           ((eq type 'tilde)
    338            (setq result '((combinator . indirect-sibling)))))
    339          (eat-whitespace))
    340         (leading-whitespace-p
    341          (setq result '((combinator . descendant))))
    342         (t nil)))
    343      result)))
    344 
    345 (defun esxml--parse-compound-css-selector ()
    346   (esxml--with-parse-shorthands
    347    (let ((type-selector (esxml--parse-css-type-selector))
    348          done
    349          result)
    350      ;; NOTE: css-type-selector css-modifier* | css-modifier+; is
    351      ;; equivalent to: [ css-type-selector | css-modifier ] css-modifier*;
    352      (if type-selector
    353          (push type-selector result)
    354        (let ((modifier (esxml--parse-css-modifier)))
    355          (if modifier
    356              (push modifier result)
    357            ;; NOTE: this allows the trailing combinator error to be thrown
    358            (setq done t))))
    359 
    360      (while (not done)
    361        (let ((modifier (esxml--parse-css-modifier)))
    362          (if modifier
    363              (push modifier result)
    364            (setq done t))))
    365      (when (> (cl-count 'id result :key 'car) 1)
    366        (error "Only one id selector allowed per compound"))
    367      (nreverse result))))
    368 
    369 (defun esxml--parse-css-type-selector ()
    370   (esxml--with-parse-shorthands
    371    (let ((token (peek)))
    372      (cond
    373       ((eq (car token) 'ident)
    374        (next)
    375        (cons 'tag (intern (esxml--parse-css-identifier (cdr token)))))
    376       ((eq (car token) 'asterisk)
    377        (next)
    378        '(wildcard))
    379       (t nil)))))
    380 
    381 (defun esxml--parse-css-modifier ()
    382   (or (esxml--parse-css-id)
    383       (esxml--parse-css-class)
    384       (esxml--parse-css-attrib)
    385       (esxml--parse-css-pseudo)))
    386 
    387 (defun esxml--parse-css-id ()
    388   (esxml--with-parse-shorthands
    389    (let ((value (accept 'hash)))
    390      (when value
    391        (cons 'id (substring value 1))))))
    392 
    393 (defun esxml--parse-css-class ()
    394   (esxml--with-parse-shorthands
    395    (when (accept 'period)
    396      (let ((value (accept 'ident)))
    397        (if value
    398            (cons 'class value)
    399          (error "Expected identifier after period"))))))
    400 
    401 (defun esxml--parse-css-attrib ()
    402   (esxml--with-parse-shorthands
    403    (let (result)
    404      (when (accept 'lbracket)
    405        (eat-whitespace)
    406        (let ((name (esxml--parse-css-attrib-name)))
    407          (when (not name)
    408            (error "Expected attribute name"))
    409          (push (cons 'name (esxml--parse-css-identifier name)) result)
    410          (when (not (accept 'rbracket))
    411            (let ((match (esxml--parse-css-attrib-match)))
    412              (when (not match)
    413                (error "Expected attribute matcher"))
    414              (let ((value (esxml--parse-css-attrib-value)))
    415                (when (not value)
    416                  (error "Expected attribute value"))
    417                (eat-whitespace)
    418                (when (not (accept 'rbracket))
    419                  (error "Unterminated attribute"))
    420                (push (cons match value) result)))))
    421        (cons 'attribute (nreverse result))))))
    422 
    423 (defun esxml--parse-css-attrib-name ()
    424   (esxml--with-parse-shorthands
    425    (let ((name (accept 'ident)))
    426      (when name
    427        (eat-whitespace)
    428        name))))
    429 
    430 (defun esxml--parse-css-attrib-match ()
    431   (esxml--with-parse-shorthands
    432    (let (result)
    433      (cond
    434       ((accept 'equals)
    435        (setq result 'exact-match))
    436       ((accept 'prefix-match)
    437        (setq result 'prefix-match))
    438       ((accept 'suffix-match)
    439        (setq result 'suffix-match))
    440       ((accept 'substring-match)
    441        (setq result 'substring-match))
    442       ((accept 'include-match)
    443        (setq result 'include-match))
    444       ((accept 'dash-match)
    445        (setq result 'dash-match)))
    446      (eat-whitespace)
    447      result)))
    448 
    449 (defun esxml--parse-css-attrib-value ()
    450   (esxml--with-parse-shorthands
    451    (let ((token (peek)))
    452      (cond
    453       ((eq (car token) 'ident)
    454        (next)
    455        (esxml--parse-css-identifier (cdr token)))
    456       ((eq (car token) 'string)
    457        (next)
    458        (esxml--parse-css-string-literal (cdr token)))
    459       (t nil)))))
    460 
    461 (defun esxml--parse-css-pseudo ()
    462   (esxml--with-parse-shorthands
    463    (let (result type)
    464      (when (accept 'colon)
    465        (if (accept 'colon)
    466            (setq type 'pseudo-element)
    467          (setq type 'pseudo-class))
    468        (let ((functional (esxml--parse-css-functional-pseudo)))
    469          (if functional
    470              (if (eq type 'pseudo-class)
    471                  (let ((value (car functional))
    472                        (args (cdr functional)))
    473                    (push (cons 'name (esxml--parse-css-identifier value)) result)
    474                    (push (cons 'args args) result))
    475                (error "Pseudo-elements may not have arguments"))
    476            (let ((value (accept 'ident)))
    477              (if value
    478                  (push (cons 'name (esxml--parse-css-identifier value)) result)
    479                (error "Expected function or identifier")))))
    480        (cons type (nreverse result))))))
    481 
    482 (defun esxml--parse-css-functional-pseudo ()
    483   (esxml--with-parse-shorthands
    484    (let ((function (accept 'function))
    485          result)
    486      (when function
    487        (push (substring function 0 -1) result)
    488        (eat-whitespace)
    489        (let ((expression (esxml--parse-css-expression))
    490              done)
    491          (eat-whitespace)
    492          (when (not expression)
    493            (error "Expected at least one expression for function"))
    494          (push expression result)
    495          (while (not done)
    496            (setq expression (esxml--parse-css-expression))
    497            (if expression
    498                (progn
    499                  (push expression result)
    500                  (eat-whitespace))
    501              (setq done t))))
    502        (when (not (accept 'rparen))
    503          (error "Unterminated function argument list"))
    504        (nreverse result)))))
    505 
    506 (defun esxml--parse-css-expression ()
    507   (esxml--with-parse-shorthands
    508    (let ((token (peek)))
    509      (cond
    510       ((accept 'plus)
    511        '(operator . +))
    512       ((accept 'minus)
    513        '(operator . -))
    514       ((eq (car token) 'dimension)
    515        (next)
    516        (cons 'dimension (esxml--parse-css-identifier (cdr token))))
    517       ((eq (car token) 'number)
    518        (next)
    519        (cons 'number (string-to-number (cdr token))))
    520       ((eq (car token) 'string)
    521        (next)
    522        (cons 'string (esxml--parse-css-string-literal (cdr token))))
    523       ((eq (car token) 'ident)
    524        (next)
    525        (cons 'ident (esxml--parse-css-identifier (cdr token))))
    526       (t nil)))))
    527 
    528 
    529 ;;; tree traversal
    530 
    531 ;; TODO: these helpers should be part of esxml.el
    532 (defun esxml-branch-p (node)
    533   "Non-nil if NODE refers to an esxml branch."
    534   (and (listp node)
    535        (>= (length node) 2)
    536        (symbolp (car node))
    537        (listp (cadr node))))
    538 
    539 (defun esxml-node-tag (node)
    540   "Returns the tag of NODE if available."
    541   (and (esxml-branch-p node)
    542        (car node)))
    543 
    544 (defun esxml-node-attributes (node)
    545   "Returns the attributes of NODE if available."
    546   (and (esxml-branch-p node)
    547        (cadr node)))
    548 
    549 (defun esxml-node-attribute (attribute node)
    550   "Returns the attribute ATTRIBUTE of NODE if available."
    551   (and (esxml-branch-p node)
    552        (cdr (assq attribute (cadr node)))))
    553 
    554 (defun esxml-node-children (node)
    555   "Returns the children of NODE if available."
    556   (and (esxml-branch-p node)
    557        (nthcdr 2 node)))
    558 
    559 (defun esxml-find-node (pred root)
    560   "Locates a node satisfying PRED starting from ROOT.
    561 Returns the node or nil if none found."
    562   (if (funcall pred root)
    563       root
    564     (cl-some (lambda (node) (esxml-find-node pred node))
    565              (esxml-node-children root))))
    566 
    567 (defun esxml-visit-nodes (function root)
    568   "Visit nodes by calling FUNCTION on each starting from ROOT."
    569   (funcall function root)
    570   (mapc (lambda (node) (esxml-visit-nodes function node))
    571         (esxml-node-children root)))
    572 
    573 (defun esxml-find-nodes (pred root)
    574   "Locates all nodes satisfying PRED starting from ROOT.
    575 Returns a list of the nodes or nil if none found."
    576   (let ((acc '()))
    577     (esxml-visit-nodes
    578      (lambda (node)
    579        (when (funcall pred node)
    580          (push node acc)))
    581      root)
    582     (nreverse acc)))
    583 
    584 (defun esxml-find-descendant (pred root)
    585   "Locates a node satisfying PRED starting from ROOT's children.
    586 Returns the node or nil if none found."
    587   (cl-some (lambda (node) (esxml-find-node pred node))
    588            (esxml-node-children root)))
    589 
    590 (defun esxml-find-descendants (pred root)
    591   "Locates all nodes satisfying PRED starting from ROOT's children.
    592 Returns a list of the nodes or nil if none found."
    593   (cl-mapcan (lambda (node) (esxml-find-nodes pred node))
    594              (esxml-node-children root)))
    595 
    596 (defun esxml-find-child (pred root)
    597   "Locates a node satisfying PRED among ROOT's children.
    598 Returns the node or nil if none found."
    599   (cl-some (lambda (node) (when (funcall pred node) node))
    600            (esxml-node-children root)))
    601 
    602 (defun esxml-find-children (pred root)
    603   "Locates all nodes satisfying PRED among ROOT's children.
    604 Returns a list of the nodes or nil if none found."
    605   (mapcar (lambda (node) (when (funcall pred node) node))
    606           (esxml-node-children root)))
    607 
    608 (defun esxml--node-with-children (node children)
    609   (let ((tag (esxml-node-tag node))
    610         (attributes (esxml-node-attributes node)))
    611     (append (list tag attributes) children)))
    612 
    613 (defun esxml--node-with-attributes (node attributes)
    614   (let ((tag (esxml-node-tag node))
    615         (children (esxml-node-children node)))
    616     (append (list tag attributes) children)))
    617 
    618 (defun esxml-tree-map (function root)
    619   "Returns a copy of ROOT with FUNCTION applied to each node."
    620   (if (esxml-branch-p root)
    621       (esxml--node-with-children
    622        (funcall function root)
    623        (mapcar (lambda (node) (esxml-tree-map function node))
    624                (esxml-node-children root)))
    625     (funcall function root)))
    626 
    627 (defvar esxml--symbol (make-symbol "id"))
    628 
    629 (defun esxml--decorate-tree (root)
    630   (let ((i 0))
    631     (esxml-tree-map
    632      (lambda (node)
    633        (let ((attribute (cons esxml--symbol i))
    634              (attributes (esxml-node-attributes node)))
    635          (setq attributes (append (list attribute) attributes))
    636          (setq i (1+ i))
    637          (if (esxml-branch-p node)
    638              (esxml--node-with-attributes node attributes)
    639            node)))
    640      root)))
    641 
    642 (defun esxml--undecorate-node (node)
    643   (if (esxml-branch-p node)
    644       (let ((attributes (esxml-node-attributes node)))
    645         (esxml--node-with-attributes node (assq-delete-all esxml--symbol
    646                                                            attributes)))
    647     node))
    648 
    649 (defun esxml--retrieve-decoration (node)
    650   (esxml-node-attribute esxml--symbol node))
    651 
    652 
    653 ;;; querying
    654 
    655 ;; NOTE: supporting structural pseudo functions, direct siblings and
    656 ;; indirect siblings requires breadth instead of depth traversal,
    657 ;; something that could be emulated without zippers if you had the
    658 ;; parent of the node (and the position of the child)...
    659 
    660 (defun esxml--node-matches-attribute-p (node modifier)
    661   (let ((attributes (esxml-node-attributes node))
    662         haystack)
    663     (cl-every
    664      (lambda (item)
    665        (let ((type (car item))
    666              (value (cdr item)))
    667          (cond
    668           ((eq type 'name)
    669            (let ((match (assq (intern value) attributes)))
    670              (setq haystack (cdr match))
    671              match))
    672           ((eq type 'exact-match)
    673            (equal haystack value))
    674           ((eq type 'prefix-match)
    675            (string-prefix-p value haystack))
    676           ((eq type 'suffix-match)
    677            (string-suffix-p value haystack))
    678           ((eq type 'substring-match)
    679            (string-match-p (regexp-quote value) haystack))
    680           ((eq type 'include-match)
    681            (member value (split-string haystack " ")))
    682           ((eq type 'dash-match)
    683            (or (equal value haystack)
    684                (string-match-p (format "^%s-" (regexp-quote value)) haystack)))
    685           (t (error "Unknown attribute modifier")))))
    686      modifier)))
    687 
    688 (defun esxml--node-matches-modifier-p (node type value)
    689   (cond
    690    ((eq type 'wildcard)
    691     t)
    692    ((eq type 'tag)
    693     (equal (esxml-node-tag node) value))
    694    ((eq type 'id)
    695     (equal (esxml-node-attribute 'id node) value))
    696    ((eq type 'class)
    697     (let ((class (esxml-node-attribute 'class node)))
    698       (and class (member value (split-string class " ")))))
    699    ((eq type 'attribute)
    700     (esxml--node-matches-attribute-p node value))
    701    ;; TODO: support structural pseudo functions
    702    ;; TODO: error out on invalid pseudo-class arguments
    703    (t (error "Unimplemented attribute type: %s" type))))
    704 
    705 (defun esxml--find-node-for (attributes)
    706   (lambda (node)
    707     (cl-every
    708      (lambda (attribute)
    709        (let ((type (car attribute))
    710              (value (cdr attribute)))
    711          (esxml--node-matches-modifier-p node type value)))
    712      attributes)))
    713 
    714 (defun esxml--find-nodes (root combinator attributes)
    715   (let* ((type (cdr (assq 'combinator combinator)))
    716          (walker (cond
    717                  ((not type)
    718                   'esxml-find-nodes)
    719                  ((eq type 'descendant)
    720                   'esxml-find-descendants)
    721                  ((eq type 'child)
    722                   'esxml-find-children)
    723                  ;; TODO: support direct sibling
    724                  ;; TODO: support indirect sibling
    725                  (t (error "Unimplemented combinator %s" combinator)))))
    726     (funcall walker (esxml--find-node-for attributes) root)))
    727 
    728 (defun esxml--query (selector root)
    729   (let* ((attributes (pop selector))
    730          combinator
    731          (result (esxml--find-nodes root nil attributes)))
    732     (while (and result selector)
    733       (setq combinator (pop selector))
    734       (setq attributes (pop selector))
    735       (setq result (cl-mapcan
    736                     (lambda (node)
    737                       (esxml--find-nodes node combinator attributes))
    738                     result))
    739       (setq result (delq nil result)))
    740     result))
    741 
    742 (defun esxml--delete-dups (items test)
    743   (let ((seen (make-hash-table :test test))
    744         result)
    745     (while items
    746       (let ((item (pop items)))
    747         (when (not (gethash item seen))
    748           (push item result)
    749           (puthash item t seen))))
    750     (nreverse result)))
    751 
    752 (defun esxml-query-all (selector root)
    753   "Locates all nodes satisfying SELECTOR starting from ROOT.
    754 SELECTOR must be a string containing a CSS selector or a parsed
    755 CSS selector returned by `esxml-parse-css-selector'.  Returns a
    756 list of the nodes or nil if none found."
    757   (when (stringp selector)
    758     (setq selector (esxml-parse-css-selector selector)))
    759   (if (= (length selector) 1)
    760       ;; no commas, we can only get the same nodes repeatedly
    761       (esxml--delete-dups (esxml--query (car selector) root) 'eq)
    762     ;; commas, nodes might be the same *and* in the wrong order
    763     (setq root (esxml--decorate-tree root))
    764     (let (result)
    765       (while selector
    766         (setq result (nconc result (esxml--query (pop selector) root))))
    767       (setq result (cl-sort result '< :key 'esxml--retrieve-decoration))
    768       (setq result (cl-delete-duplicates result :test '=
    769                                          :key 'esxml--retrieve-decoration))
    770       (mapcar (lambda (node) (esxml--undecorate-node node)) result))))
    771 
    772 (defun esxml-query (selector root)
    773   "Locates a node satisfying SELECTOR starting from ROOT.
    774 SELECTOR must be a string containing a CSS selector or a parsed
    775 CSS selector returned by `esxml-parse-css-selector'.  Returns the
    776 node or nil if none found."
    777   ;; NOTE: you can do a bit less work (the savings decrease the more
    778   ;; branches the query discards), but it's simpler and safer to just
    779   ;; have the same algorithm for both entry points
    780   (car (esxml-query-all selector root)))
    781 
    782 (provide 'esxml-query)
    783 ;;; esxml-query.el ends here