esxml-query.el (28934B)
1 ;;; esxml-query.el --- select esxml nodes jQuery-style 2 3 ;; Copyright (C) 2017 Vasilij Schneidermann <mail@vasilij.de> 4 5 ;; Author: Vasilij Schneidermann <mail@vasilij.de> 6 ;; Maintainer: Vasilij Schneidermann 7 ;; Version: 0.1.1 8 ;; Keywords: data, lisp 9 ;; Package-Requires: ((cl-lib "0.1")) 10 ;; 11 ;; This program is free software; you can redistribute it and/or 12 ;; modify it under the terms of the GNU General Public License as 13 ;; published by the Free Software Foundation, either version 3 of the 14 ;; License, or (at your option) any later version. 15 16 ;; This program is distributed in the hope that it will be useful, 17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of 18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 ;; GNU General Public License for more details. 20 21 ;; You should have received a copy of the GNU General Public License 22 ;; along with this program. If not, see <http://www.gnu.org/licenses/>. 23 24 ;;; Commentary: 25 26 ;; Traditionally people pick one of the following options when faced 27 ;; with the task of extracting data from XML in Emacs Lisp: 28 ;; 29 ;; - Using regular expressions on the unparsed document 30 ;; - Manual tree traversal with `assoc', `car' and `cdr' 31 ;; 32 ;; Browsers faced a similar problem until jQuery happened, shortly 33 ;; afterwards they started providing the `node.querySelector' and 34 ;; `node.querySelectorAll' API for retrieving one or all nodes 35 ;; matching a given CSS selector. This code implements the same API 36 ;; with the `esxml-query' and `esxml-query-all' functions. The 37 ;; following table summarizes the currently supported modifiers and 38 ;; combinators: 39 ;; 40 ;; | Name | Supported? | Syntax | 41 ;; |------------------------------------+------------+-------------| 42 ;; | Namespaces | No | foo|bar | 43 ;; | Commas | Yes | foo, bar | 44 ;; | Descendant combinator | Yes | foo bar | 45 ;; | Child combinator | Yes | foo>bar | 46 ;; | Adjacent sibling combinator | No | foo+bar | 47 ;; | General sibling combinator | No | foo~bar | 48 ;; | Universal selector | Yes | * | 49 ;; | Type selector | Yes | tag | 50 ;; | ID selector | Yes | #foo | 51 ;; | Class selector | Yes | .foo | 52 ;; | Attribute selector | Yes | [foo] | 53 ;; | Exact match attribute selector | Yes | [foo=bar] | 54 ;; | Prefix match attribute selector | Yes | [foo^=bar] | 55 ;; | Suffix match attribute selector | Yes | [foo$=bar] | 56 ;; | Substring match attribute selector | Yes | [foo*=bar] | 57 ;; | Include match attribute selector | Yes | [foo~=bar] | 58 ;; | Dash match attribute selector | Yes | [foo|=bar] | 59 ;; | Attribute selector modifiers | No | [foo=bar i] | 60 ;; | Pseudo elements | No | ::foo | 61 ;; | Pseudo classes | No | :foo | 62 63 ;;; Code: 64 65 (require 'cl-lib) 66 67 68 ;;; CSS selector parsing 69 70 ;; https://www.w3.org/TR/selectors/#w3cselgrammar 71 ;; https://www.w3.org/TR/selectors4/#grammar 72 ;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#detailed-grammar 73 ;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#tokenization 74 75 ;; you might be wondering why I'm using both level 3 and 4 standards, 76 ;; well, the level 3 one has a buggy lexer section whereas level 4 77 ;; omits crucial parser definitions, so both have to be used... 78 79 ;; TODO: support :not 80 (defvar esxml--css-selector-token-matchers 81 (let* ((h "[0-9a-f]") 82 (nl "\n\\|\r\n\\|\r\\|\f") 83 (nonascii "[\200-\U0010ffff]") 84 (unicode (format "\\\\%s\\{1,6\\}[ \t\r\n\f]?" h)) 85 (escape (format "\\(?:%s\\)\\|\\\\[ -~\200-\U0010ffff]" unicode)) 86 (nmstart (format "[a-z_]\\|%s\\|\\(?:%s\\)" nonascii escape)) 87 (nmchar (format "[a-z0-9_-]\\|%s\\|\\(?:%s\\)" nonascii escape)) 88 (num "[0-9]+\\|[0-9]*\\.[0-9]+") 89 (string1 (format "\"\\(?:[\t !#$%%&(-~]\\|\\\\\\(?:%s\\)\\|'\\|%s\\|\\(?:%s\\)\\)*\"" nl nonascii escape)) 90 (string2 (format "'\\(?:[\t !#$%%&(-~]\\|\\\\\\(?:%s\\)\\|\"\\|%s\\|\\(?:%s\\)\\)*'" nl nonascii escape)) 91 (ident (format "[-]?\\(?:%s\\)\\(?:%s\\)*" nmstart nmchar)) 92 (unit (format "[-]?\\(?:%s\\)\\(?:%s\\)+" nmstart nmchar)) 93 (name (format "\\(?:%s\\)+" nmchar))) 94 95 `((whitespace . "[ \t\r\n\f]+") 96 (string . ,(format "\\(?:%s\\|%s\\)" string1 string2)) 97 (ident . ,ident) 98 (hash . ,(format "#%s" name)) 99 (function . ,(format "%s(" ident)) 100 (number . ,num) 101 (dimension . ,(format "\\(?:%s\\)%s" num unit)) 102 (prefix-match . "\\^=") 103 (suffix-match . "\\$=") 104 (substring-match . "\\*=") 105 (include-match . "~=") 106 (dash-match . "|=") 107 (comma . ",") 108 (gt . ">") 109 (plus . "\\+") 110 (minus . "-") 111 (tilde . "~") 112 (asterisk . "\\*") 113 (period . "\\.") 114 (equals . "=") 115 (colon . ":") 116 (lbracket . "\\[") 117 (rbracket . "\\]") 118 (rparen . ")")))) 119 120 (defun esxml--tokenize-css-selector (string) 121 (let (result) 122 (with-temp-buffer 123 (insert string) 124 (goto-char (point-min)) 125 (while (not (eobp)) 126 (let ((max-length 0) 127 longest) 128 (dolist (matcher esxml--css-selector-token-matchers) 129 (let ((id (car matcher)) 130 (re (cdr matcher))) 131 (when (looking-at re) 132 (let* ((token (match-string 0)) 133 (length (length token))) 134 (when (> length max-length) 135 (setq max-length length) 136 (setq longest (cons id token))))))) 137 (when (not longest) 138 (error "Invalid token detected: %s" 139 (buffer-substring (point) (point-max)))) 140 (push longest result) 141 (goto-char (+ (point) max-length))))) 142 (nreverse result))) 143 144 ;; the alternative is creating a mutable object with peek/next methods 145 ;; and passing it around, so I chose the one requiring less typing, a 146 ;; dynamically bound variable :< 147 148 (defvar esxml--token-stream) 149 150 ;; TODO: support :not 151 ;; css-selector: 152 ;; css-selector-list; 153 ;; css-selector-list: 154 ;; complex-css-selector [ comma whitespace* complex-css-selector ]*; 155 ;; complex-css-selector: 156 ;; compound-css-selector [ css-combinator compound-css-selector ]* whitespace*; 157 ;; css-combinator: 158 ;; whitespace+ | whitespace* [ '>' | '+' | '~' ] whitespace*; 159 ;; compound-css-selector: 160 ;; css-type-selector css-modifier* | css-modifier+; 161 ;; css-type-selector: 162 ;; IDENT | *; 163 ;; css-modifier: 164 ;; css-id | css-class | css-attrib | css-pseudo; 165 ;; css-id: 166 ;; HASH; 167 ;; css-class: 168 ;; '.' IDENT; 169 ;; css-attrib: 170 ;; '[' whitespace* css-attrib-name ']' 171 ;; | '[' whitespace* css-attrib-name css-attrib-match css-attrib-value whitespace* ']'; 172 ;; css-attrib-name: 173 ;; IDENT whitespace*; 174 ;; css-attrib-match: 175 ;; [ '=' | PREFIX-MATCH | SUFFIX-MATCH | SUBSTRING-MATCH | INCLUDE-MATCH | DASH-MATCH ] whitespace*; 176 ;; css-attrib-value: 177 ;; IDENT | STRING; 178 ;; css-pseudo: 179 ;; ':' ':'? [ IDENT | css-functional-pseudo ]; 180 ;; css-functional-pseudo: 181 ;; FUNCTION whitespace* [ css-expression whitespace* ]+ ')'; 182 ;; css-expression: 183 ;; '+' | '-' | DIMENSION | NUMBER | STRING | IDENT 184 185 (defun esxml-query-css-escape (string) 186 "Returns escaped version of STRING for use in selectors. 187 The logic used here corresponds to the CSS.escape API as 188 specified in https://drafts.csswg.org/cssom/#the-css.escape()-method." 189 (let (chars) 190 (dotimes (i (length string)) 191 (let* ((char (aref string i)) 192 (unprintablep (or (and (>= char ?\u0001) (<= char ?\u001f)) 193 (= char ?\u007f))) 194 (nonasciip (>= char ?\u0080)) 195 (digitp (and (>= char ?\u0030) (<= char ?\u0039))) 196 (upperp (and (>= char ?\u0041) (<= char ?\u005a))) 197 (lowerp (and (>= char ?\u0061) (<= char ?\u007a)))) 198 (cond 199 ((= char ?\u0000) 200 (push ?\ufffd chars)) 201 (unprintablep 202 (dolist (char (string-to-list (format "\\%x " char))) 203 (push char chars))) 204 ((and (= i 0) digitp) 205 (dolist (char (string-to-list (format "\\%x " char))) 206 (push char chars))) 207 ((and (= i 1) digitp (= (aref string 0) ?-)) 208 (dolist (char (string-to-list (format "\\%x " char))) 209 (push char chars))) 210 ((and (= i 0) (= char ?-) (= (length string) 1)) 211 (push ?\\ chars) 212 (push char chars)) 213 ((or nonasciip (= char ?-) (= char ?_) digitp upperp lowerp) 214 (push char chars)) 215 (t 216 (push ?\\ chars) 217 (push char chars))))) 218 (concat (nreverse chars)))) 219 220 (defun esxml--parse-css-identifier (string) 221 ;; https://www.w3.org/TR/css-syntax-3/#consume-string-token 222 (let* ((code-points (string-to-list string)) 223 chars 224 token) 225 (while code-points 226 (let ((char (pop code-points))) 227 (if (= char ?\\) 228 (let ((char (pop code-points))) 229 (cond 230 ((not char)) 231 ((= char ?\n)) 232 ((or (and (>= char ?0) (<= char ?9)) 233 (and (>= char ?a) (<= char ?f)) 234 (and (>= char ?A) (<= char ?F))) 235 (let ((i 0) 236 (hex-chars (list char))) 237 (while (and (< i 5) code-points) 238 (let ((char (car code-points))) 239 (if (or (and (>= char ?0) (<= char ?9)) 240 (and (>= char ?a) (<= char ?f)) 241 (and (>= char ?A) (<= char ?F))) 242 (push (pop code-points) hex-chars) 243 (setq i 5))) 244 (setq i (1+ i))) 245 (let ((char (car code-points))) 246 (when (and char (= char ?\s)) 247 (pop code-points))) 248 (let* ((hex-token (concat (nreverse hex-chars))) 249 (code-point (string-to-number hex-token 16))) 250 (if (or (zerop code-point) 251 (and (>= code-point ?\ud800) (<= code-point ?\udfff)) 252 (> code-point ?\U0010ffff)) 253 (push ?\ufffd chars) 254 (push code-point chars))))) 255 (t ; unspecified: non-hex digit 256 (push char chars)))) 257 (push char chars)))) 258 (concat (nreverse chars)))) 259 260 (defun esxml--parse-css-string-literal (string) 261 (esxml--parse-css-identifier (substring string 1 -1))) 262 263 (defmacro esxml--with-parse-shorthands (&rest body) 264 `(cl-macrolet ((peek () '(car esxml--token-stream)) 265 (next () '(pop esxml--token-stream)) 266 (accept (type) `(and (peek) (eq (car (peek)) ,type) 267 (cdr (next)))) 268 (eat-whitespace () '(while (accept 'whitespace)))) 269 ,@body)) 270 (def-edebug-spec esxml--with-parse-shorthands (body)) 271 272 (defun esxml-parse-css-selector (string) 273 "Parse CSS selector STRING into a list of alists. 274 Each alist represents a complex CSS selector. The result can be 275 passed to `esxml-query' and `esxml-query-all' as the selector 276 argument." 277 (let* ((esxml--token-stream (esxml--tokenize-css-selector string)) 278 (result (esxml--parse-css-selector-list))) 279 (when esxml--token-stream 280 (error "Trailing garbage: %s" 281 (mapconcat 'cdr esxml--token-stream ""))) 282 result)) 283 284 (defun esxml--parse-css-selector-list () 285 (esxml--with-parse-shorthands 286 (let ((first (esxml--parse-complex-css-selector)) 287 result) 288 (when (not first) 289 (error "Expected at least one selector")) 290 (push first result) 291 292 (while (accept 'comma) 293 (eat-whitespace) 294 (let ((selector (esxml--parse-complex-css-selector))) 295 (when (not selector) 296 (error "Expected selector after comma")) 297 (push selector result))) 298 (nreverse result)))) 299 300 (defun esxml--parse-complex-css-selector () 301 (esxml--with-parse-shorthands 302 (let ((first (esxml--parse-compound-css-selector)) 303 result done) 304 (when first 305 (push first result) 306 307 (while (not done) 308 (let ((combinator (esxml--parse-css-combinator))) 309 (if combinator 310 (let ((compound (esxml--parse-compound-css-selector))) 311 (cond 312 (compound 313 (setq result (append (list compound combinator) result))) 314 ;; allow whitespace before comma 315 ((not (eq (car (peek)) 'comma)) 316 (error "Trailing combinator")))) 317 (setq done t)))) 318 (nreverse result))))) 319 320 (defun esxml--parse-css-combinator () 321 (esxml--with-parse-shorthands 322 ;; NOTE: whitespace-surrounded combinators are distinguished from 323 ;; whitespace-only ones by checking whether there has been 324 ;; whitespace followed by a non-blank combinator 325 (let ((leading-whitespace-p (eq (car (peek)) 'whitespace)) 326 result) 327 (eat-whitespace) 328 (let ((type (car (peek)))) 329 (cond 330 ((member type '(gt plus tilde)) 331 (next) 332 (cond 333 ((eq type 'gt) 334 (setq result '((combinator . child)))) 335 ((eq type 'plus) 336 (setq result '((combinator . direct-sibling)))) 337 ((eq type 'tilde) 338 (setq result '((combinator . indirect-sibling))))) 339 (eat-whitespace)) 340 (leading-whitespace-p 341 (setq result '((combinator . descendant)))) 342 (t nil))) 343 result))) 344 345 (defun esxml--parse-compound-css-selector () 346 (esxml--with-parse-shorthands 347 (let ((type-selector (esxml--parse-css-type-selector)) 348 done 349 result) 350 ;; NOTE: css-type-selector css-modifier* | css-modifier+; is 351 ;; equivalent to: [ css-type-selector | css-modifier ] css-modifier*; 352 (if type-selector 353 (push type-selector result) 354 (let ((modifier (esxml--parse-css-modifier))) 355 (if modifier 356 (push modifier result) 357 ;; NOTE: this allows the trailing combinator error to be thrown 358 (setq done t)))) 359 360 (while (not done) 361 (let ((modifier (esxml--parse-css-modifier))) 362 (if modifier 363 (push modifier result) 364 (setq done t)))) 365 (when (> (cl-count 'id result :key 'car) 1) 366 (error "Only one id selector allowed per compound")) 367 (nreverse result)))) 368 369 (defun esxml--parse-css-type-selector () 370 (esxml--with-parse-shorthands 371 (let ((token (peek))) 372 (cond 373 ((eq (car token) 'ident) 374 (next) 375 (cons 'tag (intern (esxml--parse-css-identifier (cdr token))))) 376 ((eq (car token) 'asterisk) 377 (next) 378 '(wildcard)) 379 (t nil))))) 380 381 (defun esxml--parse-css-modifier () 382 (or (esxml--parse-css-id) 383 (esxml--parse-css-class) 384 (esxml--parse-css-attrib) 385 (esxml--parse-css-pseudo))) 386 387 (defun esxml--parse-css-id () 388 (esxml--with-parse-shorthands 389 (let ((value (accept 'hash))) 390 (when value 391 (cons 'id (substring value 1)))))) 392 393 (defun esxml--parse-css-class () 394 (esxml--with-parse-shorthands 395 (when (accept 'period) 396 (let ((value (accept 'ident))) 397 (if value 398 (cons 'class value) 399 (error "Expected identifier after period")))))) 400 401 (defun esxml--parse-css-attrib () 402 (esxml--with-parse-shorthands 403 (let (result) 404 (when (accept 'lbracket) 405 (eat-whitespace) 406 (let ((name (esxml--parse-css-attrib-name))) 407 (when (not name) 408 (error "Expected attribute name")) 409 (push (cons 'name (esxml--parse-css-identifier name)) result) 410 (when (not (accept 'rbracket)) 411 (let ((match (esxml--parse-css-attrib-match))) 412 (when (not match) 413 (error "Expected attribute matcher")) 414 (let ((value (esxml--parse-css-attrib-value))) 415 (when (not value) 416 (error "Expected attribute value")) 417 (eat-whitespace) 418 (when (not (accept 'rbracket)) 419 (error "Unterminated attribute")) 420 (push (cons match value) result))))) 421 (cons 'attribute (nreverse result)))))) 422 423 (defun esxml--parse-css-attrib-name () 424 (esxml--with-parse-shorthands 425 (let ((name (accept 'ident))) 426 (when name 427 (eat-whitespace) 428 name)))) 429 430 (defun esxml--parse-css-attrib-match () 431 (esxml--with-parse-shorthands 432 (let (result) 433 (cond 434 ((accept 'equals) 435 (setq result 'exact-match)) 436 ((accept 'prefix-match) 437 (setq result 'prefix-match)) 438 ((accept 'suffix-match) 439 (setq result 'suffix-match)) 440 ((accept 'substring-match) 441 (setq result 'substring-match)) 442 ((accept 'include-match) 443 (setq result 'include-match)) 444 ((accept 'dash-match) 445 (setq result 'dash-match))) 446 (eat-whitespace) 447 result))) 448 449 (defun esxml--parse-css-attrib-value () 450 (esxml--with-parse-shorthands 451 (let ((token (peek))) 452 (cond 453 ((eq (car token) 'ident) 454 (next) 455 (esxml--parse-css-identifier (cdr token))) 456 ((eq (car token) 'string) 457 (next) 458 (esxml--parse-css-string-literal (cdr token))) 459 (t nil))))) 460 461 (defun esxml--parse-css-pseudo () 462 (esxml--with-parse-shorthands 463 (let (result type) 464 (when (accept 'colon) 465 (if (accept 'colon) 466 (setq type 'pseudo-element) 467 (setq type 'pseudo-class)) 468 (let ((functional (esxml--parse-css-functional-pseudo))) 469 (if functional 470 (if (eq type 'pseudo-class) 471 (let ((value (car functional)) 472 (args (cdr functional))) 473 (push (cons 'name (esxml--parse-css-identifier value)) result) 474 (push (cons 'args args) result)) 475 (error "Pseudo-elements may not have arguments")) 476 (let ((value (accept 'ident))) 477 (if value 478 (push (cons 'name (esxml--parse-css-identifier value)) result) 479 (error "Expected function or identifier"))))) 480 (cons type (nreverse result)))))) 481 482 (defun esxml--parse-css-functional-pseudo () 483 (esxml--with-parse-shorthands 484 (let ((function (accept 'function)) 485 result) 486 (when function 487 (push (substring function 0 -1) result) 488 (eat-whitespace) 489 (let ((expression (esxml--parse-css-expression)) 490 done) 491 (eat-whitespace) 492 (when (not expression) 493 (error "Expected at least one expression for function")) 494 (push expression result) 495 (while (not done) 496 (setq expression (esxml--parse-css-expression)) 497 (if expression 498 (progn 499 (push expression result) 500 (eat-whitespace)) 501 (setq done t)))) 502 (when (not (accept 'rparen)) 503 (error "Unterminated function argument list")) 504 (nreverse result))))) 505 506 (defun esxml--parse-css-expression () 507 (esxml--with-parse-shorthands 508 (let ((token (peek))) 509 (cond 510 ((accept 'plus) 511 '(operator . +)) 512 ((accept 'minus) 513 '(operator . -)) 514 ((eq (car token) 'dimension) 515 (next) 516 (cons 'dimension (esxml--parse-css-identifier (cdr token)))) 517 ((eq (car token) 'number) 518 (next) 519 (cons 'number (string-to-number (cdr token)))) 520 ((eq (car token) 'string) 521 (next) 522 (cons 'string (esxml--parse-css-string-literal (cdr token)))) 523 ((eq (car token) 'ident) 524 (next) 525 (cons 'ident (esxml--parse-css-identifier (cdr token)))) 526 (t nil))))) 527 528 529 ;;; tree traversal 530 531 ;; TODO: these helpers should be part of esxml.el 532 (defun esxml-branch-p (node) 533 "Non-nil if NODE refers to an esxml branch." 534 (and (listp node) 535 (>= (length node) 2) 536 (symbolp (car node)) 537 (listp (cadr node)))) 538 539 (defun esxml-node-tag (node) 540 "Returns the tag of NODE if available." 541 (and (esxml-branch-p node) 542 (car node))) 543 544 (defun esxml-node-attributes (node) 545 "Returns the attributes of NODE if available." 546 (and (esxml-branch-p node) 547 (cadr node))) 548 549 (defun esxml-node-attribute (attribute node) 550 "Returns the attribute ATTRIBUTE of NODE if available." 551 (and (esxml-branch-p node) 552 (cdr (assq attribute (cadr node))))) 553 554 (defun esxml-node-children (node) 555 "Returns the children of NODE if available." 556 (and (esxml-branch-p node) 557 (nthcdr 2 node))) 558 559 (defun esxml-find-node (pred root) 560 "Locates a node satisfying PRED starting from ROOT. 561 Returns the node or nil if none found." 562 (if (funcall pred root) 563 root 564 (cl-some (lambda (node) (esxml-find-node pred node)) 565 (esxml-node-children root)))) 566 567 (defun esxml-visit-nodes (function root) 568 "Visit nodes by calling FUNCTION on each starting from ROOT." 569 (funcall function root) 570 (mapc (lambda (node) (esxml-visit-nodes function node)) 571 (esxml-node-children root))) 572 573 (defun esxml-find-nodes (pred root) 574 "Locates all nodes satisfying PRED starting from ROOT. 575 Returns a list of the nodes or nil if none found." 576 (let ((acc '())) 577 (esxml-visit-nodes 578 (lambda (node) 579 (when (funcall pred node) 580 (push node acc))) 581 root) 582 (nreverse acc))) 583 584 (defun esxml-find-descendant (pred root) 585 "Locates a node satisfying PRED starting from ROOT's children. 586 Returns the node or nil if none found." 587 (cl-some (lambda (node) (esxml-find-node pred node)) 588 (esxml-node-children root))) 589 590 (defun esxml-find-descendants (pred root) 591 "Locates all nodes satisfying PRED starting from ROOT's children. 592 Returns a list of the nodes or nil if none found." 593 (cl-mapcan (lambda (node) (esxml-find-nodes pred node)) 594 (esxml-node-children root))) 595 596 (defun esxml-find-child (pred root) 597 "Locates a node satisfying PRED among ROOT's children. 598 Returns the node or nil if none found." 599 (cl-some (lambda (node) (when (funcall pred node) node)) 600 (esxml-node-children root))) 601 602 (defun esxml-find-children (pred root) 603 "Locates all nodes satisfying PRED among ROOT's children. 604 Returns a list of the nodes or nil if none found." 605 (mapcar (lambda (node) (when (funcall pred node) node)) 606 (esxml-node-children root))) 607 608 (defun esxml--node-with-children (node children) 609 (let ((tag (esxml-node-tag node)) 610 (attributes (esxml-node-attributes node))) 611 (append (list tag attributes) children))) 612 613 (defun esxml--node-with-attributes (node attributes) 614 (let ((tag (esxml-node-tag node)) 615 (children (esxml-node-children node))) 616 (append (list tag attributes) children))) 617 618 (defun esxml-tree-map (function root) 619 "Returns a copy of ROOT with FUNCTION applied to each node." 620 (if (esxml-branch-p root) 621 (esxml--node-with-children 622 (funcall function root) 623 (mapcar (lambda (node) (esxml-tree-map function node)) 624 (esxml-node-children root))) 625 (funcall function root))) 626 627 (defvar esxml--symbol (make-symbol "id")) 628 629 (defun esxml--decorate-tree (root) 630 (let ((i 0)) 631 (esxml-tree-map 632 (lambda (node) 633 (let ((attribute (cons esxml--symbol i)) 634 (attributes (esxml-node-attributes node))) 635 (setq attributes (append (list attribute) attributes)) 636 (setq i (1+ i)) 637 (if (esxml-branch-p node) 638 (esxml--node-with-attributes node attributes) 639 node))) 640 root))) 641 642 (defun esxml--undecorate-node (node) 643 (if (esxml-branch-p node) 644 (let ((attributes (esxml-node-attributes node))) 645 (esxml--node-with-attributes node (assq-delete-all esxml--symbol 646 attributes))) 647 node)) 648 649 (defun esxml--retrieve-decoration (node) 650 (esxml-node-attribute esxml--symbol node)) 651 652 653 ;;; querying 654 655 ;; NOTE: supporting structural pseudo functions, direct siblings and 656 ;; indirect siblings requires breadth instead of depth traversal, 657 ;; something that could be emulated without zippers if you had the 658 ;; parent of the node (and the position of the child)... 659 660 (defun esxml--node-matches-attribute-p (node modifier) 661 (let ((attributes (esxml-node-attributes node)) 662 haystack) 663 (cl-every 664 (lambda (item) 665 (let ((type (car item)) 666 (value (cdr item))) 667 (cond 668 ((eq type 'name) 669 (let ((match (assq (intern value) attributes))) 670 (setq haystack (cdr match)) 671 match)) 672 ((eq type 'exact-match) 673 (equal haystack value)) 674 ((eq type 'prefix-match) 675 (string-prefix-p value haystack)) 676 ((eq type 'suffix-match) 677 (string-suffix-p value haystack)) 678 ((eq type 'substring-match) 679 (string-match-p (regexp-quote value) haystack)) 680 ((eq type 'include-match) 681 (member value (split-string haystack " "))) 682 ((eq type 'dash-match) 683 (or (equal value haystack) 684 (string-match-p (format "^%s-" (regexp-quote value)) haystack))) 685 (t (error "Unknown attribute modifier"))))) 686 modifier))) 687 688 (defun esxml--node-matches-modifier-p (node type value) 689 (cond 690 ((eq type 'wildcard) 691 t) 692 ((eq type 'tag) 693 (equal (esxml-node-tag node) value)) 694 ((eq type 'id) 695 (equal (esxml-node-attribute 'id node) value)) 696 ((eq type 'class) 697 (let ((class (esxml-node-attribute 'class node))) 698 (and class (member value (split-string class " "))))) 699 ((eq type 'attribute) 700 (esxml--node-matches-attribute-p node value)) 701 ;; TODO: support structural pseudo functions 702 ;; TODO: error out on invalid pseudo-class arguments 703 (t (error "Unimplemented attribute type: %s" type)))) 704 705 (defun esxml--find-node-for (attributes) 706 (lambda (node) 707 (cl-every 708 (lambda (attribute) 709 (let ((type (car attribute)) 710 (value (cdr attribute))) 711 (esxml--node-matches-modifier-p node type value))) 712 attributes))) 713 714 (defun esxml--find-nodes (root combinator attributes) 715 (let* ((type (cdr (assq 'combinator combinator))) 716 (walker (cond 717 ((not type) 718 'esxml-find-nodes) 719 ((eq type 'descendant) 720 'esxml-find-descendants) 721 ((eq type 'child) 722 'esxml-find-children) 723 ;; TODO: support direct sibling 724 ;; TODO: support indirect sibling 725 (t (error "Unimplemented combinator %s" combinator))))) 726 (funcall walker (esxml--find-node-for attributes) root))) 727 728 (defun esxml--query (selector root) 729 (let* ((attributes (pop selector)) 730 combinator 731 (result (esxml--find-nodes root nil attributes))) 732 (while (and result selector) 733 (setq combinator (pop selector)) 734 (setq attributes (pop selector)) 735 (setq result (cl-mapcan 736 (lambda (node) 737 (esxml--find-nodes node combinator attributes)) 738 result)) 739 (setq result (delq nil result))) 740 result)) 741 742 (defun esxml--delete-dups (items test) 743 (let ((seen (make-hash-table :test test)) 744 result) 745 (while items 746 (let ((item (pop items))) 747 (when (not (gethash item seen)) 748 (push item result) 749 (puthash item t seen)))) 750 (nreverse result))) 751 752 (defun esxml-query-all (selector root) 753 "Locates all nodes satisfying SELECTOR starting from ROOT. 754 SELECTOR must be a string containing a CSS selector or a parsed 755 CSS selector returned by `esxml-parse-css-selector'. Returns a 756 list of the nodes or nil if none found." 757 (when (stringp selector) 758 (setq selector (esxml-parse-css-selector selector))) 759 (if (= (length selector) 1) 760 ;; no commas, we can only get the same nodes repeatedly 761 (esxml--delete-dups (esxml--query (car selector) root) 'eq) 762 ;; commas, nodes might be the same *and* in the wrong order 763 (setq root (esxml--decorate-tree root)) 764 (let (result) 765 (while selector 766 (setq result (nconc result (esxml--query (pop selector) root)))) 767 (setq result (cl-sort result '< :key 'esxml--retrieve-decoration)) 768 (setq result (cl-delete-duplicates result :test '= 769 :key 'esxml--retrieve-decoration)) 770 (mapcar (lambda (node) (esxml--undecorate-node node)) result)))) 771 772 (defun esxml-query (selector root) 773 "Locates a node satisfying SELECTOR starting from ROOT. 774 SELECTOR must be a string containing a CSS selector or a parsed 775 CSS selector returned by `esxml-parse-css-selector'. Returns the 776 node or nil if none found." 777 ;; NOTE: you can do a bit less work (the savings decrease the more 778 ;; branches the query discards), but it's simpler and safer to just 779 ;; have the same algorithm for both entry points 780 (car (esxml-query-all selector root))) 781 782 (provide 'esxml-query) 783 ;;; esxml-query.el ends here