dotemacs

My Emacs configuration
git clone git://git.entf.net/dotemacs
Log | Files | Refs | LICENSE

parseclj-lex.el (19422B)


      1 ;;; parseclj-lex.el --- Clojure/EDN Lexer
      2 
      3 ;; Copyright (C) 2017-2021  Arne Brasseur
      4 
      5 ;; Author: Arne Brasseur <arne@arnebrasseur.net>
      6 
      7 ;; This file is not part of GNU Emacs.
      8 
      9 ;; This file is free software; you can redistribute it and/or modify
     10 ;; it under the terms of the GNU General Public License as published by
     11 ;; the Free Software Foundation; either version 3, or (at your option)
     12 ;; any later version.
     13 
     14 ;; This file is distributed in the hope that it will be useful,
     15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     17 ;; GNU General Public License for more details.
     18 
     19 ;; You should have received a copy of the GNU General Public License
     20 ;; along with GNU Emacs; see the file COPYING.  If not, write to
     21 ;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     22 ;; Boston, MA 02110-1301, USA.
     23 
     24 ;;; Commentary:
     25 
     26 ;; A reader for EDN data files and parser for Clojure source files.
     27 
     28 ;;; Code:
     29 
     30 (require 'parseclj-alist)
     31 
     32 (defvar parseclj-lex--leaf-tokens '(:whitespace
     33                                     :comment
     34                                     :symbolic-value
     35                                     :number
     36                                     :nil
     37                                     :true
     38                                     :false
     39                                     :symbol
     40                                     :keyword
     41                                     :string
     42                                     :regex
     43                                     :character)
     44   "Types of tokens that represent leaf nodes in the AST.")
     45 
     46 (defvar parseclj-lex--closing-tokens '(:rparen
     47                                        :rbracket
     48                                        :rbrace)
     49   "Types of tokens that mark the end of a non-atomic form.")
     50 
     51 (defvar parseclj-lex--prefix-tokens '(:quote
     52                                       :backquote
     53                                       :unquote
     54                                       :unquote-splice
     55                                       :discard
     56                                       :tag
     57                                       :reader-conditional
     58                                       :reader-conditional-splice
     59                                       :var
     60                                       :deref
     61                                       :map-prefix
     62                                       :eval)
     63   "Tokens that modify the form that follows.")
     64 
     65 (defvar parseclj-lex--prefix-2-tokens '(:metadata)
     66   "Tokens that modify the two forms that follow.")
     67 
     68 ;; Token interface
     69 
     70 (defun parseclj-lex-token (type form pos &rest attributes)
     71   "Create a lexer token with the specified attributes.
     72 
     73 Tokens at a mimimum have these attributes
     74 - TYPE: the type of token, like :whitespace or :lparen
     75 - FORM: the source form, a string
     76 - POS: the position in the input, starts from 1 (like point)
     77 
     78 Other ATTRIBUTES can be given as a flat list of key-value pairs."
     79   (apply #'parseclj-alist :token-type type :form form :pos pos attributes))
     80 
     81 (defun parseclj-lex-error-token (pos &optional error-type)
     82   "Create a lexer error token starting at POS.
     83 ERROR-TYPE is an optional keyword to attach to the created token,
     84 as the means for providing more information on the error."
     85   (apply #'parseclj-lex-token
     86          :lex-error
     87          (buffer-substring-no-properties pos (point))
     88          pos
     89          (when error-type
     90            (list :error-type error-type))))
     91 
     92 (defun parseclj-lex-token-p (token)
     93   "Is the given TOKEN a parseclj-lex TOKEN.
     94 
     95 A token is an association list with :token-type as its first key."
     96   (and (consp token)
     97        (consp (car token))
     98        (eq :token-type (caar token))))
     99 
    100 (defun parseclj-lex-token-type (token)
    101   "Get the type of TOKEN."
    102   (and (consp token)
    103        (cdr (assq :token-type token))))
    104 
    105 (defun parseclj-lex-token-form (token)
    106   "Get the form of TOKEN."
    107   (and (consp token)
    108        (cdr (assq :form token))))
    109 
    110 (defun parseclj-lex-leaf-token-p (token)
    111   "Return t if the given AST TOKEN is a leaf node."
    112   (member (parseclj-lex-token-type token) parseclj-lex--leaf-tokens))
    113 
    114 (defun parseclj-lex-closing-token-p (token)
    115   "Return t if the given ast TOKEN is a closing token."
    116   (member (parseclj-lex-token-type token) parseclj-lex--closing-tokens))
    117 
    118 (defun parseclj-lex-error-p (token)
    119   "Return t if the TOKEN represents a lexing error token."
    120   (eq (parseclj-lex-token-type token) :lex-error))
    121 
    122 ;; Elisp values from tokens
    123 
    124 (defun parseclj-lex--string-value (s)
    125   "Parse an EDN string S into a regular string.
    126 S goes through three transformations:
    127 - Escaped characters in S are transformed into Elisp escaped
    128   characters.
    129 - Unicode escaped characters are decoded into its corresponding
    130   unicode character counterpart.
    131 - Octal escaped characters are decoded into its corresponding
    132   character counterpart."
    133   (replace-regexp-in-string
    134    "\\\\o[0-8]\\{3\\}"
    135    (lambda (x)
    136      (make-string 1 (string-to-number (substring x 2) 8)))
    137    (replace-regexp-in-string
    138     "\\\\u[0-9a-fA-F]\\{4\\}"
    139     (lambda (x)
    140       (make-string 1 (string-to-number (substring x 2) 16)))
    141     (replace-regexp-in-string "\\\\[tbnrf'\"\\]"
    142                               (lambda (x)
    143                                 (let ((ch (elt x 1)))
    144                                   (cond
    145                                    ((eq ?t ch) "\t")
    146                                    ((eq ?f ch) "\f")
    147                                    ((eq ?\" ch) "\"")
    148                                    ((eq ?r ch) "\r")
    149                                    ((eq ?n ch) "\n")
    150                                    ((eq ?\\ ch) "\\\\")
    151                                    (t (substring x 1)))))
    152                               (substring s 1 -1)))))
    153 
    154 (defun parseclj-lex--character-value (c)
    155   "Parse an EDN character C into an Emacs Lisp character."
    156   (let ((first-char (elt c 1)))
    157     (cond
    158      ((equal c "\\newline") ?\n)
    159      ((equal c "\\return") ?\r)
    160      ((equal c "\\space") ?\ )
    161      ((equal c "\\tab") ?\t)
    162      ((eq first-char ?u) (string-to-number (substring c 2) 16))
    163      ((eq first-char ?o) (string-to-number (substring c 2) 8))
    164      (t first-char))))
    165 
    166 (defun parseclj-lex--leaf-token-value (token)
    167   "Parse the given leaf TOKEN to an Emacs Lisp value."
    168   (let ((token-type (parseclj-lex-token-type token)))
    169     (cond
    170      ((eq :number token-type) (string-to-number (alist-get :form token)))
    171      ((eq :nil token-type) nil)
    172      ((eq :true token-type) t)
    173      ((eq :false token-type) nil)
    174      ((eq :symbol token-type) (intern (alist-get :form token)))
    175      ((eq :keyword token-type) (intern (alist-get :form token)))
    176      ((eq :string token-type) (parseclj-lex--string-value (alist-get :form token)))
    177      ((eq :character token-type) (parseclj-lex--character-value (alist-get :form token)))
    178      ((eq :symbolic-value token-type) (intern (substring (alist-get :form token) 2))))))
    179 
    180 ;; Stream tokenization
    181 
    182 (defun parseclj-lex-at-whitespace-p ()
    183   "Return t if char at point is white space."
    184   (let ((char (char-after (point))))
    185     (or (equal char ?\ )
    186         (equal char ?\t)
    187         (equal char ?\n)
    188         (equal char ?\r)
    189         (equal char ?,))))
    190 
    191 (defun parseclj-lex-at-eof-p ()
    192   "Return t if point is at the end of file."
    193   (eq (point) (point-max)))
    194 
    195 (defun parseclj-lex-whitespace ()
    196   "Consume all consecutive white space as possible and return an :whitespace token."
    197   (let ((pos (point)))
    198     (while (parseclj-lex-at-whitespace-p)
    199       (right-char))
    200     (parseclj-lex-token :whitespace
    201                         (buffer-substring-no-properties pos (point))
    202                         pos)))
    203 
    204 (defun parseclj-lex-skip-digits ()
    205   "Skip all consecutive digits after point."
    206   (while (and (char-after (point))
    207               (<= ?0 (char-after (point)))
    208               (<= (char-after (point)) ?9))
    209     (right-char)))
    210 
    211 (defun parseclj-lex-skip-hex ()
    212   "Skip all consecutive hex digits after point."
    213   (while (and (char-after (point))
    214               (or (<= ?0 (char-after (point)) ?9)
    215                   (<= ?a (char-after (point)) ?f)
    216                   (<= ?A (char-after (point)) ?F)))
    217     (right-char)))
    218 
    219 (defun parseclj-lex-skip-number ()
    220   "Skip a number at point."
    221   ;; [\+\-]?\d+\.\d+
    222   (if (and (eq ?0 (char-after (point)))
    223            (eq ?x (char-after (1+ (point)))))
    224       (progn
    225         (right-char 2)
    226         (parseclj-lex-skip-hex))
    227     (progn
    228       (when (member (char-after (point)) '(?+ ?-))
    229         (right-char))
    230 
    231       (parseclj-lex-skip-digits)
    232 
    233       (when (eq (char-after (point)) ?.)
    234         (right-char))
    235 
    236       (parseclj-lex-skip-digits))))
    237 
    238 (defun parseclj-lex-number ()
    239   "Consume a number and return a `:number' token representing it."
    240   (let ((pos (point)))
    241     (parseclj-lex-skip-number)
    242 
    243     ;; 10110r2 or 4.3e+22
    244     (when (member (char-after (point)) '(?E ?e ?r))
    245       (right-char))
    246 
    247     (parseclj-lex-skip-number)
    248 
    249     ;; trailing M
    250     (when (eq (char-after (point)) ?M)
    251       (right-char))
    252 
    253     ;; trailing N clojure.lang.BigInt
    254     (when (eq (char-after (point)) ?N)
    255       (right-char))
    256 
    257     (let ((char (char-after (point))))
    258       (if (and char (or (and (<= ?a char) (<= char ?z))
    259                         (and (<= ?A char) (<= char ?Z))
    260                         (and (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?& ?= ?< ?> ?/)))))
    261           (progn
    262             (right-char)
    263             (parseclj-lex-error-token pos :invalid-number-format))
    264         (parseclj-lex-token :number
    265                             (buffer-substring-no-properties pos (point))
    266                             pos)))))
    267 
    268 
    269 (defun parseclj-lex-digit-p (char)
    270   "Return t if CHAR is a digit."
    271   (and char (<= ?0 char) (<= char ?9)))
    272 
    273 (defun parseclj-lex-at-number-p ()
    274   "Return t if point is at a number."
    275   (let ((char (char-after (point))))
    276     (or (parseclj-lex-digit-p char)
    277         (and (member char '(?- ?+ ?.))
    278              (parseclj-lex-digit-p (char-after (1+ (point))))))))
    279 
    280 (defun parseclj-lex-symbol-start-p (char &optional alpha-only)
    281   "Return t if CHAR is a valid start for a symbol.
    282 
    283 Symbols begin with a non-numeric character and can contain alphanumeric
    284 characters and . * + ! - _ ? $ % & = < > '.  If - + or . are the first
    285 character, the second character (if any) must be non-numeric.
    286 
    287 In some cases, like in tagged elements, symbols are required to start with
    288 alphabetic characters only.  ALPHA-ONLY ensures this behavior."
    289   (not (not (and char
    290                  (or (and (<= ?a char) (<= char ?z))
    291                      (and (<= ?A char) (<= char ?Z))
    292                      (and (not alpha-only) (member char '(?. ?* ?+ ?! ?- ?_ ?? ?$ ?% ?& ?= ?< ?> ?/ ?'))))))))
    293 
    294 (defun parseclj-lex-symbol-rest-p (char)
    295   "Return t if CHAR is a valid character in a symbol.
    296 For more information on what determines a valid symbol, see
    297 `parseclj-lex-symbol-start-p'"
    298   (or (parseclj-lex-symbol-start-p char)
    299       (parseclj-lex-digit-p char)
    300       (eq ?: char)
    301       (eq ?# char)))
    302 
    303 (defun parseclj-lex-get-symbol-at-point (pos)
    304   "Return the symbol at POS as a string."
    305   (while (parseclj-lex-symbol-rest-p (char-after (point)))
    306     (right-char))
    307   (buffer-substring-no-properties pos (point)))
    308 
    309 (defun parseclj-lex-symbol ()
    310   "Return a lex token representing a symbol.
    311 Because of their special meaning, symbols \"nil\", \"true\", and \"false\"
    312 are returned as their own lex tokens."
    313   (let ((pos (point)))
    314     (right-char)
    315     (let ((sym (parseclj-lex-get-symbol-at-point pos)))
    316       (cond
    317        ((equal sym "nil") (parseclj-lex-token :nil "nil" pos))
    318        ((equal sym "true") (parseclj-lex-token :true "true" pos))
    319        ((equal sym "false") (parseclj-lex-token :false "false" pos))
    320        (t (parseclj-lex-token :symbol sym pos))))))
    321 
    322 (defun parseclj-lex-string* ()
    323   "Helper for string/regex lexing.
    324 Returns either the string, or an error token"
    325   (let ((pos (point)))
    326     (right-char)
    327     (while (not (or (equal (char-after (point)) ?\") (parseclj-lex-at-eof-p)))
    328       (if (equal (char-after (point)) ?\\)
    329           (right-char 2)
    330         (right-char)))
    331     (when (equal (char-after (point)) ?\")
    332       (right-char)
    333       (buffer-substring-no-properties pos (point)))))
    334 
    335 (defun parseclj-lex-string ()
    336   "Return a lex token representing a string.
    337 If EOF is reached without finding a closing double quote, a :lex-error
    338 token is returned."
    339   (let ((pos (point))
    340         (str (parseclj-lex-string*)))
    341     (if str
    342         (parseclj-lex-token :string str pos)
    343       (parseclj-lex-error-token pos :invalid-string))))
    344 
    345 (defun parseclj-lex-regex ()
    346   "Return a lex token representing a regular expression.
    347 If EOF is reached without finding a closing double quote, a :lex-error
    348 token is returned."
    349   (let ((pos (1- (point)))
    350         (str (parseclj-lex-string*)))
    351     (if str
    352         (parseclj-lex-token :regex (concat "#" str) pos)
    353       (parseclj-lex-error-token pos :invalid-regex))))
    354 
    355 (defun parseclj-lex-lookahead (n)
    356   "Return a lookahead string of N characters after point."
    357   (buffer-substring-no-properties (point) (min (+ (point) n) (point-max))))
    358 
    359 (defun parseclj-lex-character ()
    360   "Return a lex token representing a character."
    361   (let ((pos (point)))
    362     (right-char)
    363     (cond
    364      ((equal (parseclj-lex-lookahead 3) "tab")
    365       (right-char 3)
    366       (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
    367 
    368      ((equal (parseclj-lex-lookahead 5) "space")
    369       (right-char 5)
    370       (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
    371 
    372      ((equal (parseclj-lex-lookahead 6) "return")
    373       (right-char 6)
    374       (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
    375 
    376      ((equal (parseclj-lex-lookahead 7) "newline")
    377       (right-char 7)
    378       (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
    379 
    380      ((string-match-p "^u[0-9a-fA-F]\\{4\\}" (parseclj-lex-lookahead 5))
    381       (right-char 5)
    382       (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
    383 
    384      ((string-match-p "^o[0-8]\\{3\\}" (parseclj-lex-lookahead 4))
    385       (right-char 4)
    386       (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos))
    387 
    388      (t
    389       (right-char)
    390       (parseclj-lex-token :character (buffer-substring-no-properties pos (point)) pos)))))
    391 
    392 (defun parseclj-lex-keyword ()
    393   "Return a lex token representing a keyword.
    394 Keywords follow the same rules as symbols, except they start with one or
    395 two colon characters.
    396 
    397 See `parseclj-lex-symbol', `parseclj-lex-symbol-start-p'."
    398   (let ((pos (point)))
    399     (right-char)
    400     (when (equal (char-after (point)) ?:) ;; same-namespace keyword
    401       (right-char))
    402     (if (equal (char-after (point)) ?:) ;; three colons in a row => lex-error
    403         (progn
    404           (right-char)
    405           (parseclj-lex-error-token pos :invalid-keyword))
    406       (progn
    407         (while (or (parseclj-lex-symbol-rest-p (char-after (point)))
    408                    (equal (char-after (point)) ?#))
    409           (right-char))
    410         (parseclj-lex-token :keyword (buffer-substring-no-properties pos (point)) pos)))))
    411 
    412 (defun parseclj-lex-comment ()
    413   "Return a lex token representing a comment."
    414   (let ((pos (point)))
    415     (goto-char (line-end-position))
    416     (when (equal (char-after (point)) ?\n)
    417       (right-char))
    418     (parseclj-lex-token :comment (buffer-substring-no-properties pos (point)) pos)))
    419 
    420 (defun parseclj-lex-map-prefix ()
    421   "Return a lex token representing a map prefix."
    422   (let ((pos (1- (point))))
    423     (right-char)
    424     (when (equal (char-after (point)) ?:)
    425       (right-char))
    426     (while (parseclj-lex-symbol-rest-p (char-after (point)))
    427       (right-char))
    428     (parseclj-lex-token :map-prefix (buffer-substring-no-properties pos (point)) pos)))
    429 
    430 (defun parseclj-lex-next ()
    431   "Consume characters at point and return the next lexical token.
    432 
    433 See `parseclj-lex-token'."
    434   (if (parseclj-lex-at-eof-p)
    435       (parseclj-lex-token :eof nil (point))
    436     (let ((char (char-after (point)))
    437           (pos  (point)))
    438       (cond
    439        ((parseclj-lex-at-whitespace-p)
    440         (parseclj-lex-whitespace))
    441 
    442        ((equal char ?\()
    443         (right-char)
    444         (parseclj-lex-token :lparen "(" pos))
    445 
    446        ((equal char ?\))
    447         (right-char)
    448         (parseclj-lex-token :rparen ")" pos))
    449 
    450        ((equal char ?\[)
    451         (right-char)
    452         (parseclj-lex-token :lbracket "[" pos))
    453 
    454        ((equal char ?\])
    455         (right-char)
    456         (parseclj-lex-token :rbracket "]" pos))
    457 
    458        ((equal char ?{)
    459         (right-char)
    460         (parseclj-lex-token :lbrace "{" pos))
    461 
    462        ((equal char ?})
    463         (right-char)
    464         (parseclj-lex-token :rbrace "}" pos))
    465 
    466        ((equal char ?')
    467         (right-char)
    468         (parseclj-lex-token :quote "'" pos))
    469 
    470        ((equal char ?`)
    471         (right-char)
    472         (parseclj-lex-token :backquote "`" pos))
    473 
    474        ((equal char ?~)
    475         (right-char)
    476         (if (eq ?@ (char-after (point)))
    477             (progn
    478               (right-char)
    479               (parseclj-lex-token :unquote-splice "~@" pos))
    480           (parseclj-lex-token :unquote "~" pos)))
    481 
    482        ((parseclj-lex-at-number-p)
    483         (parseclj-lex-number))
    484 
    485        ((parseclj-lex-symbol-start-p char)
    486         (parseclj-lex-symbol))
    487 
    488        ((equal char ?\")
    489         (parseclj-lex-string))
    490 
    491        ((equal char ?\\)
    492         (parseclj-lex-character))
    493 
    494        ((equal char ?:)
    495         (parseclj-lex-keyword))
    496 
    497        ((equal char ?\;)
    498         (parseclj-lex-comment))
    499 
    500        ((equal char ?^)
    501         (right-char)
    502         (parseclj-lex-token :metadata "^" pos))
    503 
    504        ((equal char ?@)
    505         (right-char)
    506         (parseclj-lex-token :deref "@" pos))
    507 
    508        ((equal char ?#)
    509         (right-char)
    510         (let ((char (char-after (point))))
    511           (cond
    512            ((equal char ?{)
    513             (right-char)
    514             (parseclj-lex-token :set "#{" pos))
    515            ((equal char ?_)
    516             (right-char)
    517             (parseclj-lex-token :discard "#_" pos))
    518            ((equal char ?\()
    519             (right-char)
    520             (parseclj-lex-token :lambda "#(" pos))
    521            ((equal char ?')
    522             (right-char)
    523             (parseclj-lex-token :var "#'" pos))
    524            ((equal char ?=)
    525             (right-char)
    526             (parseclj-lex-token :eval "#=" pos))
    527            ((equal char ?#)
    528             (right-char)
    529             (let ((sym (parseclj-lex-get-symbol-at-point (point))))
    530               (parseclj-lex-token :symbolic-value (concat "##" sym) pos)))
    531            ((equal char ?\")
    532             (parseclj-lex-regex))
    533            ((equal char ?:)
    534             (parseclj-lex-map-prefix))
    535            ((equal char ?\?)
    536             (right-char)
    537             (if (eq ?@ (char-after (point)))
    538                 (progn
    539                   (right-char)
    540                   (parseclj-lex-token :reader-conditional-splice "#?@" pos))
    541               (parseclj-lex-token :reader-conditional "#?" pos)))
    542            ((parseclj-lex-symbol-start-p char t)
    543             (right-char)
    544             (parseclj-lex-token :tag (concat "#" (parseclj-lex-get-symbol-at-point (1+ pos))) pos))
    545            ((equal char ?!) ;; shebang
    546             (left-char)
    547             (parseclj-lex-comment))
    548            (t
    549             (while (not (or (parseclj-lex-at-whitespace-p)
    550                             (parseclj-lex-at-eof-p)))
    551               (right-char))
    552             (parseclj-lex-error-token pos :invalid-hashtag-dispatcher)))))
    553 
    554        (t
    555         (progn
    556           (right-char)
    557           (parseclj-lex-error-token pos)))))))
    558 
    559 (provide 'parseclj-lex)
    560 
    561 ;;; parseclj-lex.el ends here