parseclj-lex.el (19422B)
1 ;;; parseclj-lex.el --- Clojure/EDN Lexer 2 3 ;; Copyright (C) 2017-2021 Arne Brasseur 4 5 ;; Author: Arne Brasseur <arne@arnebrasseur.net> 6 7 ;; This file is not part of GNU Emacs. 8 9 ;; This file is free software; you can redistribute it and/or modify 10 ;; it under the terms of the GNU General Public License as published by 11 ;; the Free Software Foundation; either version 3, or (at your option) 12 ;; any later version. 13 14 ;; This file is distributed in the hope that it will be useful, 15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of 16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 ;; GNU General Public License for more details. 18 19 ;; You should have received a copy of the GNU General Public License 20 ;; along with GNU Emacs; see the file COPYING. If not, write to 21 ;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 22 ;; Boston, MA 02110-1301, USA. 23 24 ;;; Commentary: 25 26 ;; A reader for EDN data files and parser for Clojure source files. 27 28 ;;; Code: 29 30 (require 'parseclj-alist) 31 32 (defvar parseclj-lex--leaf-tokens '(:whitespace 33 :comment 34 :symbolic-value 35 :number 36 :nil 37 :true 38 :false 39 :symbol 40 :keyword 41 :string 42 :regex 43 :character) 44 "Types of tokens that represent leaf nodes in the AST.") 45 46 (defvar parseclj-lex--closing-tokens '(:rparen 47 :rbracket 48 :rbrace) 49 "Types of tokens that mark the end of a non-atomic form.") 50 51 (defvar parseclj-lex--prefix-tokens '(:quote 52 :backquote 53 :unquote 54 :unquote-splice 55 :discard 56 :tag 57 :reader-conditional 58 :reader-conditional-splice 59 :var 60 :deref 61 :map-prefix 62 :eval) 63 "Tokens that modify the form that follows.") 64 65 (defvar parseclj-lex--prefix-2-tokens '(:metadata) 66 "Tokens that modify the two forms that follow.") 67 68 ;; Token interface 69 70 (defun parseclj-lex-token (type form pos &rest attributes) 71 "Create a lexer token with the specified attributes. 72 73 Tokens at a mimimum have these attributes 74 - TYPE: the type of token, like :whitespace or :lparen 75 - FORM: the source form, a string 76 - POS: the position in the input, starts from 1 (like point) 77 78 Other ATTRIBUTES can be given as a flat list of key-value pairs." 79 (apply #'parseclj-alist :token-type type :form form :pos pos attributes)) 80 81 (defun parseclj-lex-error-token (pos &optional error-type) 82 "Create a lexer error token starting at POS. 83 ERROR-TYPE is an optional keyword to attach to the created token, 84 as the means for providing more information on the error." 85 (apply #'parseclj-lex-token 86 :lex-error 87 (buffer-substring-no-properties pos (point)) 88 pos 89 (when error-type 90 (list :error-type error-type)))) 91 92 (defun parseclj-lex-token-p (token) 93 "Is the given TOKEN a parseclj-lex TOKEN. 94 95 A token is an association list with :token-type as its first key." 96 (and (consp token) 97 (consp (car token)) 98 (eq :token-type (caar token)))) 99 100 (defun parseclj-lex-token-type (token) 101 "Get the type of TOKEN." 102 (and (consp token) 103 (cdr (assq :token-type token)))) 104 105 (defun parseclj-lex-token-form (token) 106 "Get the form of TOKEN." 107 (and (consp token) 108 (cdr (assq :form token)))) 109 110 (defun parseclj-lex-leaf-token-p (token) 111 "Return t if the given AST TOKEN is a leaf node." 112 (member (parseclj-lex-token-type token) parseclj-lex--leaf-tokens)) 113 114 (defun parseclj-lex-closing-token-p (token) 115 "Return t if the given ast TOKEN is a closing token." 116 (member (parseclj-lex-token-type token) parseclj-lex--closing-tokens)) 117 118 (defun parseclj-lex-error-p (token) 119 "Return t if the TOKEN represents a lexing error token." 120 (eq (parseclj-lex-token-type token) :lex-error)) 121 122 ;; Elisp values from tokens 123 124 (defun parseclj-lex--string-value (s) 125 "Parse an EDN string S into a regular string. 126 S goes through three transformations: 127 - Escaped characters in S are transformed into Elisp escaped 128 characters. 129 - Unicode escaped characters are decoded into its corresponding 130 unicode character counterpart. 131 - Octal escaped characters are decoded into its corresponding 132 character counterpart." 133 (replace-regexp-in-string 134 "\\\\o[0-8]\\{3\\}" 135 (lambda (x) 136 (make-string 1 (string-to-number (substring x 2) 8))) 137 (replace-regexp-in-string 138 "\\\\u[0-9a-fA-F]\\{4\\}" 139 (lambda (x) 140 (make-string 1 (string-to-number (substring x 2) 16))) 141 (replace-regexp-in-string "\\\\[tbnrf'\"\\]" 142 (lambda (x) 143 (let ((ch (elt x 1))) 144 (cond 145 ((eq ?t ch) "\t") 146 ((eq ?f ch) "\f") 147 ((eq ?\" ch) "\"") 148 ((eq ?r ch) "\r") 149 ((eq ?n ch) "\n") 150 ((eq ?\\ ch) "\\\\") 151 (t (substring x 1))))) 152 (substring s 1 -1))))) 153 154 (defun parseclj-lex--character-value (c) 155 "Parse an EDN character C into an Emacs Lisp character." 156 (let ((first-char (elt c 1))) 157 (cond 158 ((equal c "\\newline") ?\n) 159 ((equal c "\\return") ?\r) 160 ((equal c "\\space") ?\ ) 161 ((equal c "\\tab") ?\t) 162 ((eq first-char ?u) (string-to-number (substring c 2) 16)) 163 ((eq first-char ?o) (string-to-number (substring c 2) 8)) 164 (t first-char)))) 165 166 (defun parseclj-lex--leaf-token-value (token) 167 "Parse the given leaf TOKEN to an Emacs Lisp value." 168 (let ((token-type (parseclj-lex-token-type token))) 169 (cond 170 ((eq :number token-type) (string-to-number (alist-get :form token))) 171 ((eq :nil token-type) nil) 172 ((eq :true token-type) t) 173 ((eq :false token-type) nil) 174 ((eq :symbol token-type) (intern (alist-get :form token))) 175 ((eq :keyword token-type) (intern (alist-get :form token))) 176 ((eq :string token-type) (parseclj-lex--string-value (alist-get :form token))) 177 ((eq :character token-type) (parseclj-lex--character-value (alist-get :form token))) 178 ((eq :symbolic-value token-type) (intern (substring (alist-get :form token) 2)))))) 179 180 ;; Stream tokenization 181 182 (defun parseclj-lex-at-whitespace-p () 183 "Return t if char at point is white space." 184 (let ((char (char-after (point)))) 185 (or (equal char ?\ ) 186 (equal char ?\t) 187 (equal char ?\n) 188 (equal char ?\r) 189 (equal char ?,)))) 190 191 (defun parseclj-lex-at-eof-p () 192 "Return t if point is at the end of file." 193 (eq (point) (point-max))) 194 195 (defun parseclj-lex-whitespace () 196 "Consume all consecutive white space as possible and return an :whitespace token." 197 (let ((pos (point))) 198 (while (parseclj-lex-at-whitespace-p) 199 (right-char)) 200 (parseclj-lex-token :whitespace 201 (buffer-substring-no-properties pos (point)) 202 pos))) 203 204 (defun parseclj-lex-skip-digits () 205 "Skip all consecutive digits after point." 206 (while (and (char-after (point)) 207 (<= ?0 (char-after (point))) 208 (<= (char-after (point)) ?9)) 209 (right-char))) 210 211 (defun parseclj-lex-skip-hex () 212 "Skip all consecutive hex digits after point." 213 (while (and (char-after (point)) 214 (or (<= ?0 (char-after (point)) ?9) 215 (<= ?a (char-after (point)) ?f) 216 (<= ?A (char-after (point)) ?F))) 217 (right-char))) 218 219 (defun parseclj-lex-skip-number () 220 "Skip a number at point." 221 ;; [\+\-]?\d+\.\d+ 222 (if (and (eq ?0 (char-after (point))) 223 (eq ?x (char-after (1+ (point))))) 224 (progn 225 (right-char 2) 226 (parseclj-lex-skip-hex)) 227 (progn 228 (when (member (char-after (point)) '(?+ ?-)) 229 (right-char)) 230 231 (parseclj-lex-skip-digits) 232 233 (when (eq (char-after (point)) ?.) 234 (right-char)) 235 236 (parseclj-lex-skip-digits)))) 237 238 (defun parseclj-lex-number () 239 "Consume a number and return a `:number' token representing it." 240 (let ((pos (point))) 241 (parseclj-lex-skip-number) 242 243 ;; 10110r2 or 4.3e+22 244 (when (member (char-after (point)) '(?E ?e ?r)) 245 (right-char)) 246 247 (parseclj-lex-skip-number) 248 249 ;; trailing M 250 (when (eq (char-after (point)) ?M) 251 (right-char)) 252 253 ;; trailing N clojure.lang.BigInt 254 (when (eq (char-after (point)) ?N) 255 (right-char)) 256 257 (let ((char (char-after (point)))) 258 (if (and char (or (and (<= ?a char) (<= char ?z)) 259 (and (<= ?A char) (<= char ?Z)) 260 (and (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?& ?= ?< ?> ?/))))) 261 (progn 262 (right-char) 263 (parseclj-lex-error-token pos :invalid-number-format)) 264 (parseclj-lex-token :number 265 (buffer-substring-no-properties pos (point)) 266 pos))))) 267 268 269 (defun parseclj-lex-digit-p (char) 270 "Return t if CHAR is a digit." 271 (and char (<= ?0 char) (<= char ?9))) 272 273 (defun parseclj-lex-at-number-p () 274 "Return t if point is at a number." 275 (let ((char (char-after (point)))) 276 (or (parseclj-lex-digit-p char) 277 (and (member char '(?- ?+ ?.)) 278 (parseclj-lex-digit-p (char-after (1+ (point)))))))) 279 280 (defun parseclj-lex-symbol-start-p (char &optional alpha-only) 281 "Return t if CHAR is a valid start for a symbol. 282 283 Symbols begin with a non-numeric character and can contain alphanumeric 284 characters and . * + ! - _ ? $ % & = < > '. If - + or . are the first 285 character, the second character (if any) must be non-numeric. 286 287 In some cases, like in tagged elements, symbols are required to start with 288 alphabetic characters only. ALPHA-ONLY ensures this behavior." 289 (not (not (and char 290 (or (and (<= ?a char) (<= char ?z)) 291 (and (<= ?A char) (<= char ?Z)) 292 (and (not alpha-only) (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?% ?& ?= ?< ?> ?/ ?')))))))) 293 294 (defun parseclj-lex-symbol-rest-p (char) 295 "Return t if CHAR is a valid character in a symbol. 296 For more information on what determines a valid symbol, see 297 `parseclj-lex-symbol-start-p'" 298 (or (parseclj-lex-symbol-start-p char) 299 (parseclj-lex-digit-p char) 300 (eq ?: char) 301 (eq ?# char))) 302 303 (defun parseclj-lex-get-symbol-at-point (pos) 304 "Return the symbol at POS as a string." 305 (while (parseclj-lex-symbol-rest-p (char-after (point))) 306 (right-char)) 307 (buffer-substring-no-properties pos (point))) 308 309 (defun parseclj-lex-symbol () 310 "Return a lex token representing a symbol. 311 Because of their special meaning, symbols \"nil\", \"true\", and \"false\" 312 are returned as their own lex tokens." 313 (let ((pos (point))) 314 (right-char) 315 (let ((sym (parseclj-lex-get-symbol-at-point pos))) 316 (cond 317 ((equal sym "nil") (parseclj-lex-token :nil "nil" pos)) 318 ((equal sym "true") (parseclj-lex-token :true "true" pos)) 319 ((equal sym "false") (parseclj-lex-token :false "false" pos)) 320 (t (parseclj-lex-token :symbol sym pos)))))) 321 322 (defun parseclj-lex-string* () 323 "Helper for string/regex lexing. 324 Returns either the string, or an error token" 325 (let ((pos (point))) 326 (right-char) 327 (while (not (or (equal (char-after (point)) ?\") (parseclj-lex-at-eof-p))) 328 (if (equal (char-after (point)) ?\\) 329 (right-char 2) 330 (right-char))) 331 (when (equal (char-after (point)) ?\") 332 (right-char) 333 (buffer-substring-no-properties pos (point))))) 334 335 (defun parseclj-lex-string () 336 "Return a lex token representing a string. 337 If EOF is reached without finding a closing double quote, a :lex-error 338 token is returned." 339 (let ((pos (point)) 340 (str (parseclj-lex-string*))) 341 (if str 342 (parseclj-lex-token :string str pos) 343 (parseclj-lex-error-token pos :invalid-string)))) 344 345 (defun parseclj-lex-regex () 346 "Return a lex token representing a regular expression. 347 If EOF is reached without finding a closing double quote, a :lex-error 348 token is returned." 349 (let ((pos (1- (point))) 350 (str (parseclj-lex-string*))) 351 (if str 352 (parseclj-lex-token :regex (concat "#" str) pos) 353 (parseclj-lex-error-token pos :invalid-regex)))) 354 355 (defun parseclj-lex-lookahead (n) 356 "Return a lookahead string of N characters after point." 357 (buffer-substring-no-properties (point) (min (+ (point) n) (point-max)))) 358 359 (defun parseclj-lex-character () 360 "Return a lex token representing a character." 361 (let ((pos (point))) 362 (right-char) 363 (cond 364 ((equal (parseclj-lex-lookahead 3) "tab") 365 (right-char 3) 366 (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) 367 368 ((equal (parseclj-lex-lookahead 5) "space") 369 (right-char 5) 370 (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) 371 372 ((equal (parseclj-lex-lookahead 6) "return") 373 (right-char 6) 374 (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) 375 376 ((equal (parseclj-lex-lookahead 7) "newline") 377 (right-char 7) 378 (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) 379 380 ((string-match-p "^u[0-9a-fA-F]\\{4\\}" (parseclj-lex-lookahead 5)) 381 (right-char 5) 382 (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) 383 384 ((string-match-p "^o[0-8]\\{3\\}" (parseclj-lex-lookahead 4)) 385 (right-char 4) 386 (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)) 387 388 (t 389 (right-char) 390 (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))))) 391 392 (defun parseclj-lex-keyword () 393 "Return a lex token representing a keyword. 394 Keywords follow the same rules as symbols, except they start with one or 395 two colon characters. 396 397 See `parseclj-lex-symbol', `parseclj-lex-symbol-start-p'." 398 (let ((pos (point))) 399 (right-char) 400 (when (equal (char-after (point)) ?:) ;; same-namespace keyword 401 (right-char)) 402 (if (equal (char-after (point)) ?:) ;; three colons in a row => lex-error 403 (progn 404 (right-char) 405 (parseclj-lex-error-token pos :invalid-keyword)) 406 (progn 407 (while (or (parseclj-lex-symbol-rest-p (char-after (point))) 408 (equal (char-after (point)) ?#)) 409 (right-char)) 410 (parseclj-lex-token :keyword (buffer-substring-no-properties pos (point)) pos))))) 411 412 (defun parseclj-lex-comment () 413 "Return a lex token representing a comment." 414 (let ((pos (point))) 415 (goto-char (line-end-position)) 416 (when (equal (char-after (point)) ?\n) 417 (right-char)) 418 (parseclj-lex-token :comment (buffer-substring-no-properties pos (point)) pos))) 419 420 (defun parseclj-lex-map-prefix () 421 "Return a lex token representing a map prefix." 422 (let ((pos (1- (point)))) 423 (right-char) 424 (when (equal (char-after (point)) ?:) 425 (right-char)) 426 (while (parseclj-lex-symbol-rest-p (char-after (point))) 427 (right-char)) 428 (parseclj-lex-token :map-prefix (buffer-substring-no-properties pos (point)) pos))) 429 430 (defun parseclj-lex-next () 431 "Consume characters at point and return the next lexical token. 432 433 See `parseclj-lex-token'." 434 (if (parseclj-lex-at-eof-p) 435 (parseclj-lex-token :eof nil (point)) 436 (let ((char (char-after (point))) 437 (pos (point))) 438 (cond 439 ((parseclj-lex-at-whitespace-p) 440 (parseclj-lex-whitespace)) 441 442 ((equal char ?\() 443 (right-char) 444 (parseclj-lex-token :lparen "(" pos)) 445 446 ((equal char ?\)) 447 (right-char) 448 (parseclj-lex-token :rparen ")" pos)) 449 450 ((equal char ?\[) 451 (right-char) 452 (parseclj-lex-token :lbracket "[" pos)) 453 454 ((equal char ?\]) 455 (right-char) 456 (parseclj-lex-token :rbracket "]" pos)) 457 458 ((equal char ?{) 459 (right-char) 460 (parseclj-lex-token :lbrace "{" pos)) 461 462 ((equal char ?}) 463 (right-char) 464 (parseclj-lex-token :rbrace "}" pos)) 465 466 ((equal char ?') 467 (right-char) 468 (parseclj-lex-token :quote "'" pos)) 469 470 ((equal char ?`) 471 (right-char) 472 (parseclj-lex-token :backquote "`" pos)) 473 474 ((equal char ?~) 475 (right-char) 476 (if (eq ?@ (char-after (point))) 477 (progn 478 (right-char) 479 (parseclj-lex-token :unquote-splice "~@" pos)) 480 (parseclj-lex-token :unquote "~" pos))) 481 482 ((parseclj-lex-at-number-p) 483 (parseclj-lex-number)) 484 485 ((parseclj-lex-symbol-start-p char) 486 (parseclj-lex-symbol)) 487 488 ((equal char ?\") 489 (parseclj-lex-string)) 490 491 ((equal char ?\\) 492 (parseclj-lex-character)) 493 494 ((equal char ?:) 495 (parseclj-lex-keyword)) 496 497 ((equal char ?\;) 498 (parseclj-lex-comment)) 499 500 ((equal char ?^) 501 (right-char) 502 (parseclj-lex-token :metadata "^" pos)) 503 504 ((equal char ?@) 505 (right-char) 506 (parseclj-lex-token :deref "@" pos)) 507 508 ((equal char ?#) 509 (right-char) 510 (let ((char (char-after (point)))) 511 (cond 512 ((equal char ?{) 513 (right-char) 514 (parseclj-lex-token :set "#{" pos)) 515 ((equal char ?_) 516 (right-char) 517 (parseclj-lex-token :discard "#_" pos)) 518 ((equal char ?\() 519 (right-char) 520 (parseclj-lex-token :lambda "#(" pos)) 521 ((equal char ?') 522 (right-char) 523 (parseclj-lex-token :var "#'" pos)) 524 ((equal char ?=) 525 (right-char) 526 (parseclj-lex-token :eval "#=" pos)) 527 ((equal char ?#) 528 (right-char) 529 (let ((sym (parseclj-lex-get-symbol-at-point (point)))) 530 (parseclj-lex-token :symbolic-value (concat "##" sym) pos))) 531 ((equal char ?\") 532 (parseclj-lex-regex)) 533 ((equal char ?:) 534 (parseclj-lex-map-prefix)) 535 ((equal char ?\?) 536 (right-char) 537 (if (eq ?@ (char-after (point))) 538 (progn 539 (right-char) 540 (parseclj-lex-token :reader-conditional-splice "#?@" pos)) 541 (parseclj-lex-token :reader-conditional "#?" pos))) 542 ((parseclj-lex-symbol-start-p char t) 543 (right-char) 544 (parseclj-lex-token :tag (concat "#" (parseclj-lex-get-symbol-at-point (1+ pos))) pos)) 545 ((equal char ?!) ;; shebang 546 (left-char) 547 (parseclj-lex-comment)) 548 (t 549 (while (not (or (parseclj-lex-at-whitespace-p) 550 (parseclj-lex-at-eof-p))) 551 (right-char)) 552 (parseclj-lex-error-token pos :invalid-hashtag-dispatcher))))) 553 554 (t 555 (progn 556 (right-char) 557 (parseclj-lex-error-token pos))))))) 558 559 (provide 'parseclj-lex) 560 561 ;;; parseclj-lex.el ends here